PROJECT 2 DATA 607 MSDS

Preparing different datasets for downstream analysis work.

#——————————————————————————————————-

Ron Balaban

Possibly needed libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(tidyr)
library(dplyr)
library(ggplot2)

Dataset 1; AirBnB

(Found here: https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data)
airbnb_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/AB_NYC_2019.csv"
airbnb_raw <- read.csv(airbnb_url, header = TRUE, stringsAsFactors = FALSE)
head(airbnb_raw)
##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  2018-10-19              0.21
## 2              1                45  2019-05-21              0.38
## 3              3                 0                            NA
## 4              1               270  2019-07-05              4.64
## 5             10                 9  2018-11-19              0.10
## 6              3                74  2019-06-22              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129

Transform to dataframe

airbnb_df <- as.data.frame(airbnb_raw)  
airbnb_df <- subset(airbnb_df, price > 0) # Remove those with price missing
head(airbnb_df)
##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  2018-10-19              0.21
## 2              1                45  2019-05-21              0.38
## 3              3                 0                            NA
## 4              1               270  2019-07-05              4.64
## 5             10                 9  2018-11-19              0.10
## 6              3                74  2019-06-22              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129

Prices overall

ggplot(airbnb_df, aes(x = price)) +
  geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +  # A solid scale like above seems to miss some data
  geom_vline(xintercept = median(airbnb_df$price), color = "red", linewidth = 1, linetype = "solid") +
  geom_vline(xintercept = mean(airbnb_df$price), color = "darkgreen", linewidth = 1, linetype = "solid") +
  annotate("text", x = 500, y = 2500, label = "Median Price =  $106", color = "red", size = 5) +
  annotate("text", x = 500, y = 3500, label = "Mean Price =  $163", color = "darkgreen", size = 5) +
  labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices")
## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

We can see that the majority of the data for AirBnB lies within price range of 0 to 300, with a few outliers. Will revisit the outliers later.

Frequency of Prices per neighborhood

# Get Mean and Median prices for each neighborhood
price_neighborhood_group <- airbnb_df %>% 
  group_by(neighbourhood_group) %>% 
  dplyr::summarise(median_price = round(median(price),0),
                   mean_price   = round(mean(price),0))


# Plot- a more detailed breakdown of the prior graph.
ggplot(airbnb_df, aes(x = price)) +
  geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +  
  labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices per Neighborhood") +
  facet_wrap(~neighbourhood_group) +
  geom_text(data = price_neighborhood_group, y = 3000, aes(x = 500, label = paste("Mean Price = $", mean_price)), color = "darkgreen", size = 3) +
  geom_text(data = price_neighborhood_group, y = 2500, aes(x = 500, label = paste("Median Price = $", median_price)), color = "red", size = 3)
## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 10 rows containing missing values (`geom_bar()`).

Price breakdown for AirBnB rentals per neighborhood. As expected, the majority of the data is contained within Brooklyn and Manhattan.

Price versus reviews

ggplot(airbnb_df, aes(x = number_of_reviews, y = price, color = room_type)) +
  #geom_point(alpha = 0.5, color = "blue") +
  geom_point() +
  facet_wrap(~neighbourhood_group, scales = "free") +
  scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
  #scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
  labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews")

You can see that a majority of rentals are for people taking the entire place to themself, as very few would want to share a room or apartment on their vacation. However, there does not seem to be a strong correlation between how many reviews a place has and its price, as there are many AirBnBs with a low price and few reviews. Let’s look at this again, but where the plot is denser and we have more data to view- let’s cap the price at 1000

Price versus reviews below 1000$

airbnb_df_below1000 <- airbnb_df %>% 
  filter(price < 1000)

ggplot(airbnb_df_below1000, aes(x = number_of_reviews, y = price, color = room_type)) +
  #geom_point(alpha = 0.5, color = "blue") +
  geom_point() +
  facet_wrap(~neighbourhood_group, scales = "free") +
  scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
  #scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
  labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews for AirBnB below 1000$")

#-------------------------------------------------------------------------------
# Looking at the outliers for price vs reviews
airbnb_manyreviews_highprice <- airbnb_df %>% 
  filter(number_of_reviews > 400 & price > 500)

ggplot(airbnb_manyreviews_highprice, aes(x = number_of_reviews, y = price)) +
  #geom_point(alpha = 0.5, color = "blue") +
  geom_point() +
  geom_text(label = airbnb_manyreviews_highprice$name) +
  facet_wrap(~neighbourhood_group, scales = "free") +
  scale_x_continuous(limits = c(446, 448), breaks = seq(446,448, by = 1)) +
  scale_y_continuous(limits = c(574, 576), breaks = seq(574,576, by = 1)) +
  labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews for AirBnB over 1000$")

Here, we have a better view of the data. It seems that the 2 fields don’t really have a solid correlation on each other, as there are many rentals with a low price, but few reviews. There are also many rentals with a low price, but many reviews. However, what is obvious, is that as the price increases, there will generally be less reviews as fewer customers will want to rent the unit, and hence there will be less reviews.
Interestingly enough, there is an AirBnB in Manhattan that has ~450 reviews and costs ~600. Looking into why this one AirBnB stands out so much, it becomes obvious; it’s a beautiful 2500 sq. ft. apartment with its own elevator, which out of curiosity I found on Google Maps;

AirBnB types per neighborhood

airbnb_neighborhood_homes <-airbnb_df %>%
  group_by(neighbourhood_group) %>%
  count(room_type)

airbnb_neighborhood_homes   
## # A tibble: 15 × 3
## # Groups:   neighbourhood_group [5]
##    neighbourhood_group room_type           n
##    <chr>               <chr>           <int>
##  1 Bronx               Entire home/apt   379
##  2 Bronx               Private room      651
##  3 Bronx               Shared room        60
##  4 Brooklyn            Entire home/apt  9558
##  5 Brooklyn            Private room    10126
##  6 Brooklyn            Shared room       411
##  7 Manhattan           Entire home/apt 13198
##  8 Manhattan           Private room     7982
##  9 Manhattan           Shared room       480
## 10 Queens              Entire home/apt  2096
## 11 Queens              Private room     3372
## 12 Queens              Shared room       198
## 13 Staten Island       Entire home/apt   176
## 14 Staten Island       Private room      188
## 15 Staten Island       Shared room         9
# Types of Airbnb's in each neighborhood
ggplot(airbnb_neighborhood_homes, aes(x = neighbourhood_group, y = n, fill = room_type)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Types of Airbnb's", x = "Neighbourhood", y = "Count") +
  coord_flip()  

Once again, Brooklyn and Manhattan are the majority of rentals, and the majority are either for the entire place, or having their own room. In which case, they are probably sharing a place with their friends for vacation.

Types of Airbnb’s in each Neighborhood

ggplot(data = airbnb_df) + 
  geom_bar(mapping = aes(x=room_type, fill=neighbourhood_group), position = "dodge") +
  labs(title = "AirBnB Listings per Neighborhood ", x = "Listings", y = "Count") +
  coord_flip()  

AirBnb prices in each neighborhood

# By neighborhood
airbnb_neighborhood_prices <-airbnb_df %>%
  group_by(neighbourhood_group, neighbourhood) %>% 
  summarize(avg_price = mean(price),
            min_price = min(price), 
            max_price = max(price))
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
ggplot(data = airbnb_neighborhood_prices, aes(x = neighbourhood, y = avg_price, color = neighbourhood)) + 
  geom_point() +
  labs(title = " Average Prices per neighborhood") +
  xlab("Neighborhood") + ylab("Average Price") + 
  theme(axis.text.x=element_blank()) + # axis.text.x=element_blank() removes all neighborhood names
  theme(legend.position = "none") +
  facet_wrap(~neighbourhood_group) 

The majority of the more expensive places are in Manhattan, with some very odd outliers with a price over 400 in the other boroughs of NYC. This is because of some ridiculous outliers that cost more than 2500, as seen in ‘Price versus reviews’
outliers <- airbnb_df %>% 
  filter(price >= 2500)

outliers  %>% 
  group_by(neighbourhood_group, neighbourhood) %>% 
  arrange(price)
## # A tibble: 77 × 16
## # Groups:   neighbourhood_group, neighbourhood [36]
##          id name    host_id host_name neighbourhood_group neighbourhood latitude
##       <int> <chr>     <int> <chr>     <chr>               <chr>            <dbl>
##  1   893413 "Archi…  4.75e6 Martin    Manhattan           East Village      40.7
##  2  2276383 "Penth…  1.16e7 Mike      Manhattan           Greenwich Vi…     40.7
##  3 12339863 "Loft"   1.00e7 Claudine  Manhattan           Tribeca           40.7
##  4 14408114 "Unpar…  8.36e5 Henry     Manhattan           Midtown           40.8
##  5 19554980 "A Coz…  9.78e7 Logan     Manhattan           Upper West S…     40.8
##  6 19698169 "\"The…  1.32e8 Kathy     Bronx               Riverdale         40.9
##  7 23373090 "SHOOT…  1.18e7 V         Brooklyn            Williamsburg      40.7
##  8 25018204 "Parad…  1.73e8 Rasmus    Manhattan           Harlem            40.8
##  9 31470004 "Priva…  7.12e7 Max       Manhattan           East Village      40.7
## 10 34592851 "Beaut…  1.99e7 Cheryl    Brooklyn            Crown Heights     40.7
## # ℹ 67 more rows
## # ℹ 9 more variables: longitude <dbl>, room_type <chr>, price <int>,
## #   minimum_nights <int>, number_of_reviews <int>, last_review <chr>,
## #   reviews_per_month <dbl>, calculated_host_listings_count <int>,
## #   availability_365 <int>
ggplot(data = outliers, aes(x = price, fill = neighbourhood_group)) + 
  geom_histogram(bins= 100) +
  labs(title = "Outlier Prices per neighborhood (2500$ +)") +
  xlab("Price") + ylab("Frequency") + 
  scale_x_continuous(limits = c(2000, 10500), breaks = seq(2000, 10000, by = 1000)) +
  scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12, by = 2))
## Warning: Removed 10 rows containing missing values (`geom_bar()`).

These outliers are what causes the increase in the mean prices.

AirBnB listings in Brooklyn & Manhattan

airbnb_Brooklyn <- airbnb_df%>%
  filter(neighbourhood_group == "Brooklyn")

ggplot(airbnb_Brooklyn, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Listings in Brooklyn", x = "Listings", y = "Neighbourhood") 

#-------------------------------------------------------------------------------
airbnb_Manhattan <- airbnb_df%>%
  filter(neighbourhood_group == "Manhattan")

ggplot(airbnb_Manhattan, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Listings in Manhattan", x = "Listings", y = "Neighbourhood") 

Dataset 2; Spotify

(Found here: https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023)
spotify_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/spotify-2023.csv"
spotify_raw <- read.csv(spotify_url, header = TRUE, stringsAsFactors = FALSE)

# Make into dataframe
spotify_df <- as.data.frame(spotify_raw)  
head(spotify_df)
##                            track_name    artist.s._name artist_count
## 1 Seven (feat. Latto) (Explicit Ver.)  Latto, Jung Kook            2
## 2                                LALA       Myke Towers            1
## 3                             vampire    Olivia Rodrigo            1
## 4                        Cruel Summer      Taylor Swift            1
## 5                      WHERE SHE GOES         Bad Bunny            1
## 6                            Sprinter Dave, Central Cee            2
##   released_year released_month released_day in_spotify_playlists
## 1          2023              7           14                  553
## 2          2023              3           23                 1474
## 3          2023              6           30                 1397
## 4          2019              8           23                 7858
## 5          2023              5           18                 3133
## 6          2023              6            1                 2186
##   in_spotify_charts   streams in_apple_playlists in_apple_charts
## 1               147 141381703                 43             263
## 2                48 133716286                 48             126
## 3               113 140003974                 94             207
## 4               100 800840817                116             207
## 5                50 303236322                 84             133
## 6                91 183706234                 67             213
##   in_deezer_playlists in_deezer_charts in_shazam_charts bpm key  mode
## 1                  45               10              826 125   B Major
## 2                  58               14              382  92  C# Major
## 3                  91               14              949 138   F Major
## 4                 125               12              548 170   A Major
## 5                  87               15              425 144   A Minor
## 6                  88               17              946 141  C# Major
##   danceability_. valence_. energy_. acousticness_. instrumentalness_.
## 1             80        89       83             31                  0
## 2             71        61       74              7                  0
## 3             51        32       53             17                  0
## 4             55        58       72             11                  0
## 5             65        23       80             14                 63
## 6             92        66       58             19                  0
##   liveness_. speechiness_.
## 1          8             4
## 2         10             4
## 3         31             6
## 4         11            15
## 5         11             6
## 6          8            24
This dataset contains a comprehensive list of the most famous songs of 2023 as listed on Spotify. The dataset offers a wealth of features beyond what is typically available in similar datasets. It provides insights into each song’s attributes, popularity, and presence on various music platforms. The dataset includes information such as track name, artist(s) name, release date, Spotify playlists and charts, streaming statistics, Apple Music presence, Deezer presence, Shazam charts, and various audio features.

Removing messy data

# The data has lots of odd � errors (replacement character; https://charbase.com/fffd-unicode-replacement-character), mostly for  Hispanic  accents. 
library(stringi)

# Convert all non-ASCII characters to spaces in the song name
spotify_df[,1] <- stringi::stri_trans_general(spotify_df[,1], "Latin-ASCII")
spotify_df[,1] <- iconv(spotify_df[,1], to = "ASCII", sub = "")

# Convert all non-ASCII characters to spaces in the artist's name
spotify_df[,2] <- stringi::stri_trans_general(spotify_df[,2], "Latin-ASCII")
spotify_df[,2] <- iconv(spotify_df[,2], to = "ASCII", sub = "")

Dataset 3; Diabetes

(Found here: https://www.kaggle.com/datasets/aemyjutt/diabetesdataanslysis)
diabetes_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/diabetes.csv"
diabetes_raw <- read.csv(diabetes_url, header = TRUE, stringsAsFactors = FALSE)

# Make into dataframe
diabetes_df <- as.data.frame(diabetes_raw)  
head(diabetes_df)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

Change 0/1 Diabetes diagnosis to Negative/Positive diagnosis

diabetes_df <- diabetes_df %>% 
   mutate(Outcome = ifelse(Outcome == '1', "Positive", "Negative"))

head(diabetes_df)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age  Outcome
## 1                    0.627  50 Positive
## 2                    0.351  31 Negative
## 3                    0.672  32 Positive
## 4                    0.167  21 Negative
## 5                    2.288  33 Positive
## 6                    0.201  30 Negative

Address missing values (Except for Pregnancy)

diabetes_df <-  diabetes_df %>%
  #mutate(across(.cols = Glucose:Age, .fns = ~ifelse(.x == 0, round(mean(.x), digits = 0), .x)))
  mutate(across(.cols = Glucose:Age, ~ replace(., . == 0, round(mean(., na.rm = TRUE),0))))
# Didn't mutate to zero-values as it could be men.

head(diabetes_df)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35      80 33.6
## 2           1      85            66            29      80 26.6
## 3           8     183            64            21      80 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74            21      80 25.6
##   DiabetesPedigreeFunction Age  Outcome
## 1                    0.627  50 Positive
## 2                    0.351  31 Negative
## 3                    0.672  32 Positive
## 4                    0.167  21 Negative
## 5                    2.288  33 Positive
## 6                    0.201  30 Negative

Basic histograms vs Diabetes Diagnosis

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:formattable':
## 
##     style
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
g_preg <- ggplot(diabetes_df, aes(x=Pregnancies, fill = Outcome)) +
  geom_histogram(binwidth = 0.5) + 
  theme(legend.position = "none")

g_glucose <- ggplot(diabetes_df, aes(x=Glucose, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_bp <-ggplot(diabetes_df, aes(x=BloodPressure, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_skin <-ggplot(diabetes_df, aes(x=SkinThickness, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_insulin <-ggplot(diabetes_df, aes(x=Insulin, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_bmi <-ggplot(diabetes_df, aes(x=BMI, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_dpf <-ggplot(diabetes_df, aes(x=DiabetesPedigreeFunction, fill = Outcome)) +
   geom_histogram(binwidth = 0.05) + 
   theme(legend.position = "none")

g_age <-ggplot(diabetes_df, aes(x=Age, fill = Outcome)) +
   geom_histogram() + 
   theme(legend.position = "none")

g_diabetes <-ggplot(diabetes_df, aes(x=Outcome, fill = Outcome)) +
   geom_histogram(stat="count")
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
#-------------------------------------------------------------------------------
library(patchwork)
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:formattable':
## 
##     area
# https://patchwork.data-imaginist.com/

(g_preg + g_glucose + g_bp ) /
(g_skin + g_insulin + g_bmi) /
(g_dpf  + g_age     + g_diabetes)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What is immediately evident after looking at these distributions, is that you see higher rates of positive diabetes in people with higher glucose, as there doesn’t seem to be a strong correlation between the other fields. I attempted to find the correlation between the other 8 fields with the diabetes outcome, but seeing as it is a character, I’d need to use logistic regression to measure it compared to the numeric features, by having the categorical feature as a target and numeric as an input. Alternatively, use the Chi-Square test and ANOVA (Analysis of Variance) (https://datascience.stackexchange.com/questions/893/how-to-get-correlation-between-two-categorical-variable-and-a-categorical-variab)

Attempt below, but was unsuccessful.

#categorical_var <- factor(c("Positive", "Negative"))
# Convert the categorical variable to a binary variable
#binary_var <- as.numeric(categorical_var) - 1
# Calculate point-biserial correlation
#correlation <- cor(binary_var, diabetes_df$Pregnancies)
#-------------------------------------------------------------------------------
# ANOVA attempt
#anova_result <- aov(diabetes_df$Pregnancies ~ categorical_var)

#-------------------------------------------------------------------------------
# Base correlations won't work here.
#cor(diabetes_df$Outcome, diabetes_df$Pregnancies)
#cor(diabetes_df$Outcome, diabetes_df$Glucose)
#cor(diabetes_df$Outcome, diabetes_df$BloodPressure)
#cor(diabetes_df$Outcome, diabetes_df$SkinThickness)
#cor(diabetes_df$Outcome, diabetes_df$Insulin)
#cor(diabetes_df$Outcome, diabetes_df$BMI)
#cor(diabetes_df$Outcome, diabetes_df$DiabetesPedigreeFunction)
#cor(diabetes_df$Outcome, diabetes_df$Age)

Those who have diabetes (268/ 768)

diabetes_positive <- diabetes_df %>%
  filter(Outcome == "Positive")

diabetes_positive  %>%
  group_by(Glucose, Age, BloodPressure) %>% 
        arrange(desc(Glucose))
## # A tibble: 268 × 9
## # Groups:   Glucose, Age, BloodPressure [268]
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <int>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           1     199            76            43      80  42.9
##  2           0     198            66            32     274  41.3
##  3           2     197            70            45     543  30.5
##  4           8     197            74            21      80  25.9
##  5           2     197            70            99      80  34.7
##  6           7     196            90            21      80  39.8
##  7           8     196            76            29     280  37.5
##  8           1     196            76            36     249  36.5
##  9           7     195            70            33     145  25.1
## 10           6     195            70            21      80  30.9
## # ℹ 258 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <chr>

#——————————————————————————————————- # Folorunsho Atanda

Load required library

library(tidyverse)
library(RMySQL)
## Loading required package: DBI
library(dotenv)

Create database connection

dotenv::load_dot_env(file = "sql_pw.env")
my_sql_pw <- Sys.getenv("MYSQL_PW")

db <- dbConnect(
  MySQL(),
  user = "root",
  #password = my_sql_pw,
  password = "0830spscuny2023!!!",
  dbname = "project_2",
  host = "localhost",
  port = 3306
)

Load table from database as a data frame

diabetes <- db %>% 
  dbGetQuery("select * from project_2.diabetes")

diabetes_df <- as_tibble(diabetes)

# Disconnect database
db_status <- dbDisconnect(db)

Because 0 can’t exist in the columns Glucose, BloodPressure, SkinThickness, Insulin, BMI, DPF and Age, going to change does to NA. Then also change 0 to Negative and 1 to Positive in Outcome

columns_to_modify <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin",
                       "BMI", "DPF", "Age")

diabetes_df <- diabetes_df %>% 
  mutate(across(all_of(columns_to_modify), ~if_else(. == 0, NA, .))) %>% 
  mutate("Outcome" = ifelse(`Outcome` == 0, "Negative", "Positive"))

Summarise data

glimpse(diabetes_df)
## Rows: 768
## Columns: 10
## $ ID            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ Pregnancies   <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7,…
## $ Glucose       <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168,…
## $ BloodPressure <int> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, …
## $ SkinThickness <int> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, …
## $ Insulin       <int> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 84…
## $ BMI           <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA…
## $ DPF           <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, …
## $ Age           <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, …
## $ Outcome       <chr> "Positive", "Negative", "Positive", "Negative", "Positiv…
summary(diabetes_df)
##        ID         Pregnancies        Glucose      BloodPressure   
##  Min.   :  1.0   Min.   : 0.000   Min.   : 44.0   Min.   : 24.00  
##  1st Qu.:192.8   1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00  
##  Median :384.5   Median : 3.000   Median :117.0   Median : 72.00  
##  Mean   :384.5   Mean   : 3.845   Mean   :121.7   Mean   : 72.41  
##  3rd Qu.:576.2   3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00  
##  Max.   :768.0   Max.   :17.000   Max.   :199.0   Max.   :122.00  
##                                   NA's   :5       NA's   :35      
##  SkinThickness      Insulin            BMI             DPF        
##  Min.   : 7.00   Min.   : 14.00   Min.   :18.20   Min.   :0.0780  
##  1st Qu.:22.00   1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437  
##  Median :29.00   Median :125.00   Median :32.30   Median :0.3725  
##  Mean   :29.15   Mean   :155.55   Mean   :32.46   Mean   :0.4719  
##  3rd Qu.:36.00   3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262  
##  Max.   :99.00   Max.   :846.00   Max.   :67.10   Max.   :2.4200  
##  NA's   :227     NA's   :374      NA's   :11                      
##       Age          Outcome         
##  Min.   :21.00   Length:768        
##  1st Qu.:24.00   Class :character  
##  Median :29.00   Mode  :character  
##  Mean   :33.24                     
##  3rd Qu.:41.00                     
##  Max.   :81.00                     
## 

Compare Variables

Pregnancies vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Pregnancies`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge", 
    na.rm = TRUE) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Pregnancies`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    ) + 
  labs(title = "Pregnancies vs Outcome")

diabetes_df %>% 
  select(c(`Pregnancies`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Pregnancies_mean = mean(`Pregnancies`), 
    Pregnancies_median = median(`Pregnancies`),
    Pregnancies_Q1 = quantile(`Pregnancies`, probs = 0.25),
    Pregnancies_Q2 = quantile(`Pregnancies`, probs = 0.75)
    )
## # A tibble: 2 × 5
##   Outcome  Pregnancies_mean Pregnancies_median Pregnancies_Q1 Pregnancies_Q2
##   <chr>               <dbl>              <dbl>          <dbl>          <dbl>
## 1 Negative             3.30                  2           1                 5
## 2 Positive             4.87                  4           1.75              8

Analysis: The distribution of Pregnancies to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

Glucose vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Glucose`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Glucose`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) + 
  labs(title = "Glucose vs Outcome")

diabetes_df %>% 
  select(c(`Glucose`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Glucose_mean = mean(`Glucose`, na.rm = TRUE), 
    Glucose_median = median(`Glucose`, na.rm = TRUE),
    Glucose_Q1 = quantile(`Glucose`, probs = 0.25, na.rm = TRUE),
    Glucose_Q2 = quantile(`Glucose`, probs = 0.75, na.rm = TRUE)
    )
## # A tibble: 2 × 5
##   Outcome  Glucose_mean Glucose_median Glucose_Q1 Glucose_Q2
##   <chr>           <dbl>          <dbl>      <dbl>      <dbl>
## 1 Negative         111.            107         93        125
## 2 Positive         142.            140        119        167

Analysis: The distribution of Glucose to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 93 and 167.

Blood Pressure vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `BloodPressure`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `BloodPressure`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) 

diabetes_df %>% 
  select(c(`BloodPressure`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    BloodPressure_mean = mean(`BloodPressure`, na.rm = TRUE), 
    BloodPressure_median = median(`BloodPressure`, na.rm = TRUE),
    BloodPressure_Q1 = quantile(`BloodPressure`, probs = 0.25, na.rm = TRUE),
    BloodPressure_Q2 = quantile(`BloodPressure`, probs = 0.75, na.rm = TRUE)
    )
## # A tibble: 2 × 5
##   Outcome  BloodPressure_mean BloodPressure_median BloodPressure_Q1
##   <chr>                 <dbl>                <dbl>            <dbl>
## 1 Negative               70.9                 70                 62
## 2 Positive               75.3                 74.5               68
## # ℹ 1 more variable: BloodPressure_Q2 <dbl>

Analysis: The distribution of BloodPressure to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 62 and 84.

Skin Thickness vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `SkinThickness`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `SkinThickness`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) 

diabetes_df %>% 
  select(c(`SkinThickness`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    SkinThickness_mean = mean(`SkinThickness`, na.rm = TRUE), 
    SkinThickness_median = median(`SkinThickness`, na.rm = TRUE),
    SkinThickness_Q1 = quantile(`SkinThickness`, probs = 0.25, na.rm = TRUE),
    SkinThickness_Q2 = quantile(`SkinThickness`, probs = 0.75, na.rm = TRUE)
    )
## # A tibble: 2 × 5
##   Outcome  SkinThickness_mean SkinThickness_median SkinThickness_Q1
##   <chr>                 <dbl>                <dbl>            <dbl>
## 1 Negative               27.2                   27               19
## 2 Positive               33                     32               27
## # ℹ 1 more variable: SkinThickness_Q2 <dbl>

Analysis: The distribution of SkinThickness to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 19 and 39.

Insulin vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Insulin`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Insulin`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) 

diabetes_df %>% 
  select(c(`Insulin`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Insulin_mean = mean(`Insulin`, na.rm = TRUE), 
    Insulin_median = median(`Insulin`, na.rm = TRUE),
    Insulin_Q1 = quantile(`Insulin`, probs = 0.25, na.rm = TRUE),
    Insulin_Q2 = quantile(`Insulin`, probs = 0.75, na.rm = TRUE)
    )
## # A tibble: 2 × 5
##   Outcome  Insulin_mean Insulin_median Insulin_Q1 Insulin_Q2
##   <chr>           <dbl>          <dbl>      <dbl>      <dbl>
## 1 Negative         130.           102.        66        161.
## 2 Positive         207.           170.       128.       239.

Analysis: The distribution of Insulin to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

BMI vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `BMI`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `BMI`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) 

diabetes_df %>% 
  select(c(`BMI`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    BMI_mean = mean(`BMI`, na.rm = TRUE), 
    BMI_median = median(`BMI`, na.rm = TRUE),
    BMI_Q1 = quantile(`BMI`, probs = 0.25, na.rm = TRUE),
    BMI_Q2 = quantile(`BMI`, probs = 0.75, na.rm = TRUE)
    )
## # A tibble: 2 × 5
##   Outcome  BMI_mean BMI_median BMI_Q1 BMI_Q2
##   <chr>       <dbl>      <dbl>  <dbl>  <dbl>
## 1 Negative     30.9       30.1   25.6   35.3
## 2 Positive     35.4       34.3   30.9   38.9

Analysis: The distribution of BMI to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 19 and 39.

DPF vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `DPF`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `DPF`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) 

diabetes_df %>% 
  select(c(`DPF`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    DPF_mean = mean(`DPF`, na.rm = TRUE), 
    DPF_median = median(`DPF`, na.rm = TRUE),
    DPF_Q1 = quantile(`DPF`, probs = 0.25, na.rm = TRUE),
    DPF_Q2 = quantile(`DPF`, probs = 0.75, na.rm = TRUE)
    )
## # A tibble: 2 × 5
##   Outcome  DPF_mean DPF_median DPF_Q1 DPF_Q2
##   <chr>       <dbl>      <dbl>  <dbl>  <dbl>
## 1 Negative    0.430      0.336  0.230  0.562
## 2 Positive    0.550      0.449  0.262  0.728

Analysis: The distribution of DPF to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

Age vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Age`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Age`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) 

diabetes_df %>% 
  select(c(`Age`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Age_mean = mean(`Age`), 
    Age_median = median(`Age`),
    Age_Q1 = quantile(`Age`, probs = 0.25),
    Age_Q2 = quantile(`Age`, probs = 0.75)
    )
## # A tibble: 2 × 5
##   Outcome  Age_mean Age_median Age_Q1 Age_Q2
##   <chr>       <dbl>      <dbl>  <dbl>  <dbl>
## 1 Negative     31.2         27     23     37
## 2 Positive     37.1         36     28     44

Analysis: The distribution of Age to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

Correlation

Because Glucose, BloodPressure, and BMI have normal distribution, I want to see if there is a correlation between them, focusing on a positive diabetes Outcome.

diabetes_cor <- diabetes_df %>% 
  select(c(`Glucose`, `BloodPressure`, `BMI`, `Outcome`)) %>% 
  filter(`Outcome` == "Positive") 

Glucose_BloodPressure_cor <- cor(
  x = as.numeric(diabetes_cor$Glucose),  
  y = as.numeric(diabetes_cor$BloodPressure), 
  use = "na.or.complete",
  method = "pearson"
  )

Glucose_BMI_cor <- cor(
  x = as.numeric(diabetes_cor$Glucose),  
  y = as.numeric(diabetes_cor$BMI), 
  use = "na.or.complete", 
  method = "pearson"
  )

BloodPressure_BMI_cor <- cor(
  x = as.numeric(diabetes_cor$BloodPressure),  
  y = as.numeric(diabetes_cor$BMI), 
  use = "na.or.complete",
  method = "pearson"
  )

correlation_df <- tibble(
  Relationship = c("Glucose vs Blood Pressure", "Glucose vs BMI",
                   "Blood Pressure vs BMI"),
  Correlation = c(Glucose_BloodPressure_cor, Glucose_BMI_cor, 
                  BloodPressure_BMI_cor)
)

correlation_df
## # A tibble: 3 × 2
##   Relationship              Correlation
##   <chr>                           <dbl>
## 1 Glucose vs Blood Pressure      0.101 
## 2 Glucose vs BMI                 0.0566
## 3 Blood Pressure vs BMI          0.249

Analysis: From the table we see that there is no correlation between these three variables.

#——————————————————————————————————-

XHULIA TURKAJ

I have decided to work with the New York Prison Employee Discipline Data from The Marshall Project Found here: (https://observablehq.com/@themarshallproject/new-york-prison-employee-discipline-data)

The data is originally retrieved from the NY State Department of Corrections and Community Supervision

employee_discipline_url <- "https://raw.githubusercontent.com/xhuliaturkaj/Project_2/main/NYPrisonEmployeeDiscipline.csv"
employee_discipline <- read.csv(employee_discipline_url, header = TRUE)
head(employee_discipline)
##            LNAME    FNAME   TITLE      FACILITY    UNION MISCONDUCT    CLOSED
## 1      STEVENSON  SYLBERN      CO     EDGECOMBE NYSCOPBA     FD, NG 01-Jul-20
## 2         BODDEN  MICHAEL      CO   GREEN HAVEN NYSCOPBA     IA, FD 01-Jul-20
## 3 GIVANS-DANIELS   GLENDA      CO BEDFORD HILLS NYSCOPBA         DI 06-Jul-20
## 4      ROUANTREE    KEITH      CO   GREEN HAVEN NYSCOPBA FD, IA, NG 06-Jul-20
## 5       MELENDEZ     IRIS      CO     COXSACKIE NYSCOPBA         IN 06-Jul-20
## 6           BUSH JONATHAN ASAT-PA        ALBION      PEF         IN 08-Jul-20
##   REASON              PENALTY
## 1    NOD DISMISSAL & ACCRUALS
## 2    NOD DISMISSAL & ACCRUALS
## 3    NOD DISMISSAL & ACCRUALS
## 4    NOD DISMISSAL & ACCRUALS
## 5    NOD DISMISSAL & ACCRUALS
## 6    NOD               30D SP
##                                                                                                                                                                                                                                                            Field0
## 1                                                  MADE FALSE/MISLEADING ENTRIES INTO LOGBOOK. DID NOT PERFORM SECURITY ROUNDS ON 5/16/20, 5/17/20 AND 5/19/20 . FAILED TO MAINTAIN SECURE CONTROL OF SECURITY EQUIPMENT (LOST KEY RING  AND COULD NOT LOCATE IT)
## 2                                                                                               ENGAGED IN UNJUSTIFIED AND EXCESSIVE USE OF FORCE ON AN INMATE, THREATENED INMATE, MADE FALSE AND/OR MISLEADING INFORMATION REGARDING THE USE OF FORCE INCIDENCES
## 3                                                                           ENGAGED IN RELATIONSHIP W/INMATES/PAROLEES HAVING OVER 500 TELEPHONE CALLS STATEING CONVERSATIONS NOT IN FURTHERANCE OF WORK RELATED OBJECTIVES & W/O PERMISSION TO DO SO FROM DOCCS.
## 4                                                                            FAILED TO DETECT AND/OR TAK ANY ACTION REGARDING AN INMATE ENGAGED IN IMPROPER ACTIVITIES (REMOVING A FEEDING TUBE FROM HIS NOSE), FAILED TO TAKE ACTION TO ADDRESS ANOTHER EMPLOYEE
## 5 REFUSED DIRECT ORDER FROM SGT. TO ASSUME DUTIES OF C-3 DIVISION WHILE YELLING REFUSAL TO SGT. REFUSED DIRECT ORDER FROM LT. TO ASSUME DUTIES WHILE YELLING REFUSAL TO LT. REFUSED DIRECT ORDER FROM SGT. TO ASSUME DUTIES OF RMHU ESCORTWHILE YELLING REFUSAL T
## 6                                                                                                                                                                             INSUBORDINATE REFUSED TO COMMUNICATE PROPERLY WHEN ASKED IF INFORMATION WAS CORRECT
##     DISPODT DISPO                   PENALTYDIS ARBITRATOR PERCLOSED page_num
## 1 7/28/2020     S              $2,000, 12M DEP                    Y        1
## 2 4/30/2021     R RESIGNING EFFECTINVE 4/30/21                    Y        1
## 3 10/1/2020     S               3M SP, 12M DEP                    Y        1
## 4  7/7/2020     S               3W SP, 12M DEP                    Y        1
## 5 7/24/2020     S                        3W SP                    Y        1
## 6 10/7/2020     S           1W SP, 5D, 18M DEP                    Y        1

Let’s start by counting all the unique combinations of first and last name

unique_names <- employee_discipline %>%
  group_by(LNAME, FNAME) %>%
  summarize(count = n())
## `summarise()` has grouped output by 'LNAME'. You can override using the
## `.groups` argument.
n_unique_combinations <- nrow(unique_names)
print(n_unique_combinations)
## [1] 953

There is a total of 953 unique combinations which means there are 953 employees included in this data set. Let’s format the data frame in a tidy format.

#First I will create a new column that represents first and last names and will get rid 
# of the separate first and last name columns. 

employee_discipline <- employee_discipline %>%
  mutate(FULL_NAME = paste(LNAME, FNAME)) %>%
  select(FULL_NAME, everything(), -LNAME, -FNAME)


#Let's run distinct one more time on the composed FULL_NAME variable 


length(unique(employee_discipline$FULL_NAME))
## [1] 953
#We again confirmed that there are 953 unique observations/employees in this data frame

Let’s perform some data cleaning

#First remove the columns that we are not interested in


employee_discipline <-  employee_discipline %>%
  select(-c(UNION, CLOSED, REASON, Field0, DISPODT, DISPO, PENALTYDIS, ARBITRATOR, PERCLOSED, page_num  ))

#Convert the other columns to factor

employee_discipline <- employee_discipline %>%
  mutate_at(vars(FACILITY, TITLE, MISCONDUCT, PENALTY), as.factor)


#Next let's create a unique identifier for each Employee

employee_discipline <- employee_discipline %>%
  group_by(FULL_NAME) %>%
  mutate(employee_id = cur_group_id()) %>%
  ungroup()

#Make sure there are 953 employee ids 

length(unique(employee_discipline$employee_id))
## [1] 953
head(employee_discipline)
## # A tibble: 6 × 6
##   FULL_NAME             TITLE   FACILITY      MISCONDUCT PENALTY     employee_id
##   <chr>                 <fct>   <fct>         <fct>      <fct>             <int>
## 1 STEVENSON SYLBERN     CO      EDGECOMBE     FD, NG     DISMISSAL …         853
## 2 BODDEN MICHAEL        CO      GREEN HAVEN   IA, FD     DISMISSAL …         132
## 3 GIVANS-DANIELS GLENDA CO      BEDFORD HILLS DI         DISMISSAL …         395
## 4 ROUANTREE KEITH       CO      GREEN HAVEN   FD, IA, NG DISMISSAL …         792
## 5 MELENDEZ IRIS         CO      COXSACKIE     IN         DISMISSAL …         629
## 6 BUSH JONATHAN         ASAT-PA ALBION        IN         30D SP              179
head(employee_discipline)
## # A tibble: 6 × 6
##   FULL_NAME             TITLE   FACILITY      MISCONDUCT PENALTY     employee_id
##   <chr>                 <fct>   <fct>         <fct>      <fct>             <int>
## 1 STEVENSON SYLBERN     CO      EDGECOMBE     FD, NG     DISMISSAL …         853
## 2 BODDEN MICHAEL        CO      GREEN HAVEN   IA, FD     DISMISSAL …         132
## 3 GIVANS-DANIELS GLENDA CO      BEDFORD HILLS DI         DISMISSAL …         395
## 4 ROUANTREE KEITH       CO      GREEN HAVEN   FD, IA, NG DISMISSAL …         792
## 5 MELENDEZ IRIS         CO      COXSACKIE     IN         DISMISSAL …         629
## 6 BUSH JONATHAN         ASAT-PA ALBION        IN         30D SP              179

Next let’s convert this data frame into a wide tidy format