PROJECT 2 DATA 607 MSDS

Preparing different datasets for downstream analysis work.

#——————————————————————————————————-

Ron Balaban

Possibly needed libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(openintro)

## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

library(tidyr)
library(dplyr)
library(ggplot2)

Dataset 1; AirBnB

(Found here: https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data)

airbnb_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/AB_NYC_2019.csv"
airbnb_raw <- read.csv(airbnb_url, header = TRUE, stringsAsFactors = FALSE)
head(airbnb_raw)

##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  2018-10-19              0.21
## 2              1                45  2019-05-21              0.38
## 3              3                 0                            NA
## 4              1               270  2019-07-05              4.64
## 5             10                 9  2018-11-19              0.10
## 6              3                74  2019-06-22              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129

Transform to dataframe

airbnb_df <- as.data.frame(airbnb_raw)  
airbnb_df <- subset(airbnb_df, price > 0) # Remove those with price missing
head(airbnb_df)

##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  2018-10-19              0.21
## 2              1                45  2019-05-21              0.38
## 3              3                 0                            NA
## 4              1               270  2019-07-05              4.64
## 5             10                 9  2018-11-19              0.10
## 6              3                74  2019-06-22              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129

Prices overall

ggplot(airbnb_df, aes(x = price)) +
  geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +  # A solid scale like above seems to miss some data
  geom_vline(xintercept = median(airbnb_df$price), color = "red", linewidth = 1, linetype = "solid") +
  geom_vline(xintercept = mean(airbnb_df$price), color = "darkgreen", linewidth = 1, linetype = "solid") +
  annotate("text", x = 500, y = 2500, label = "Median Price =  $106", color = "red", size = 5) +
  annotate("text", x = 500, y = 3500, label = "Mean Price =  $163", color = "darkgreen", size = 5) +
  labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices")

## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 2 rows containing missing values (`geom_bar()`).

We can see that the majority of the data for AirBnB lies within price range of 0 to 300, with a few outliers. Will revisit the outliers later.

Frequency of Prices per neighborhood

# Get Mean and Median prices for each neighborhood
price_neighborhood_group <- airbnb_df %>% 
  group_by(neighbourhood_group) %>% 
  dplyr::summarise(median_price = round(median(price),0),
                   mean_price   = round(mean(price),0))


# Plot- a more detailed breakdown of the prior graph.
ggplot(airbnb_df, aes(x = price)) +
  geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +  
  labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices per Neighborhood") +
  facet_wrap(~neighbourhood_group) +
  geom_text(data = price_neighborhood_group, y = 3000, aes(x = 500, label = paste("Mean Price = $", mean_price)), color = "darkgreen", size = 3) +
  geom_text(data = price_neighborhood_group, y = 2500, aes(x = 500, label = paste("Median Price = $", median_price)), color = "red", size = 3)

## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 10 rows containing missing values (`geom_bar()`).

Price breakdown for AirBnB rentals per neighborhood. As expected, the majority of the data is contained within Brooklyn and Manhattan.

Price versus reviews

ggplot(airbnb_df, aes(x = number_of_reviews, y = price, color = room_type)) +
  #geom_point(alpha = 0.5, color = "blue") +
  geom_point() +
  facet_wrap(~neighbourhood_group, scales = "free") +
  scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
  #scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
  labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews")

Price versus reviews below 1000$

airbnb_df_below1000 <- airbnb_df %>% 
  filter(price < 1000)

ggplot(airbnb_df_below1000, aes(x = number_of_reviews, y = price, color = room_type)) +
  #geom_point(alpha = 0.5, color = "blue") +
  geom_point() +
  facet_wrap(~neighbourhood_group, scales = "free") +
  scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
  #scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
  labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews for AirBnB below 1000$")

#-------------------------------------------------------------------------------
# Looking at the outliers for price vs reviews
airbnb_manyreviews_highprice <- airbnb_df %>% 
  filter(number_of_reviews > 400 & price > 500)

ggplot(airbnb_manyreviews_highprice, aes(x = number_of_reviews, y = price)) +
  #geom_point(alpha = 0.5, color = "blue") +
  geom_point() +
  geom_text(label = airbnb_manyreviews_highprice$name) +
  facet_wrap(~neighbourhood_group, scales = "free") +
  scale_x_continuous(limits = c(446, 448), breaks = seq(446,448, by = 1)) +
  scale_y_continuous(limits = c(574, 576), breaks = seq(574,576, by = 1)) +
  labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews for AirBnB over 1000$")

Here, we have a better view of the data. It seems that the 2 fields don’t really have a solid correlation on each other, as there are many rentals with a low price, but few reviews. There are also many rentals with a low price, but many reviews. However, what is obvious, is that as the price increases, there will generally be less reviews as fewer customers will want to rent the unit, and hence there will be less reviews.

Interestingly enough, there is an AirBnB in Manhattan that has ~450 reviews and costs ~600. Looking into why this one AirBnB stands out so much, it becomes obvious; it’s a beautiful 2500 sq. ft. apartment with its own elevator, which out of curiosity I found on Google Maps;

AirBnB types per neighborhood

airbnb_neighborhood_homes <-airbnb_df %>%
  group_by(neighbourhood_group) %>%
  count(room_type)

airbnb_neighborhood_homes

## # A tibble: 15 × 3
## # Groups:   neighbourhood_group [5]
##    neighbourhood_group room_type           n
##    <chr>               <chr>           <int>
##  1 Bronx               Entire home/apt   379
##  2 Bronx               Private room      651
##  3 Bronx               Shared room        60
##  4 Brooklyn            Entire home/apt  9558
##  5 Brooklyn            Private room    10126
##  6 Brooklyn            Shared room       411
##  7 Manhattan           Entire home/apt 13198
##  8 Manhattan           Private room     7982
##  9 Manhattan           Shared room       480
## 10 Queens              Entire home/apt  2096
## 11 Queens              Private room     3372
## 12 Queens              Shared room       198
## 13 Staten Island       Entire home/apt   176
## 14 Staten Island       Private room      188
## 15 Staten Island       Shared room         9

# Types of Airbnb's in each neighborhood
ggplot(airbnb_neighborhood_homes, aes(x = neighbourhood_group, y = n, fill = room_type)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Types of Airbnb's", x = "Neighbourhood", y = "Count") +
  coord_flip()

Types of Airbnb’s in each Neighborhood

ggplot(data = airbnb_df) + 
  geom_bar(mapping = aes(x=room_type, fill=neighbourhood_group), position = "dodge") +
  labs(title = "AirBnB Listings per Neighborhood ", x = "Listings", y = "Count") +
  coord_flip()

AirBnb prices in each neighborhood

# By neighborhood
airbnb_neighborhood_prices <-airbnb_df %>%
  group_by(neighbourhood_group, neighbourhood) %>% 
  summarize(avg_price = mean(price),
            min_price = min(price), 
            max_price = max(price))

## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.

ggplot(data = airbnb_neighborhood_prices, aes(x = neighbourhood, y = avg_price, color = neighbourhood)) + 
  geom_point() +
  labs(title = " Average Prices per neighborhood") +
  xlab("Neighborhood") + ylab("Average Price") + 
  theme(axis.text.x=element_blank()) + # axis.text.x=element_blank() removes all neighborhood names
  theme(legend.position = "none") +
  facet_wrap(~neighbourhood_group)

The majority of the more expensive places are in Manhattan, with some very odd outliers with a price over 400 in the other boroughs of NYC. This is because of some ridiculous outliers that cost more than 2500, as seen in ‘Price versus reviews’

outliers <- airbnb_df %>% 
  filter(price >= 2500)

outliers  %>% 
  group_by(neighbourhood_group, neighbourhood) %>% 
  arrange(price)

## # A tibble: 77 × 16
## # Groups:   neighbourhood_group, neighbourhood [36]
##          id name    host_id host_name neighbourhood_group neighbourhood latitude
##       <int> <chr>     <int> <chr>     <chr>               <chr>            <dbl>
##  1   893413 "Archi…  4.75e6 Martin    Manhattan           East Village      40.7
##  2  2276383 "Penth…  1.16e7 Mike      Manhattan           Greenwich Vi…     40.7
##  3 12339863 "Loft"   1.00e7 Claudine  Manhattan           Tribeca           40.7
##  4 14408114 "Unpar…  8.36e5 Henry     Manhattan           Midtown           40.8
##  5 19554980 "A Coz…  9.78e7 Logan     Manhattan           Upper West S…     40.8
##  6 19698169 "\"The…  1.32e8 Kathy     Bronx               Riverdale         40.9
##  7 23373090 "SHOOT…  1.18e7 V         Brooklyn            Williamsburg      40.7
##  8 25018204 "Parad…  1.73e8 Rasmus    Manhattan           Harlem            40.8
##  9 31470004 "Priva…  7.12e7 Max       Manhattan           East Village      40.7
## 10 34592851 "Beaut…  1.99e7 Cheryl    Brooklyn            Crown Heights     40.7
## # ℹ 67 more rows
## # ℹ 9 more variables: longitude <dbl>, room_type <chr>, price <int>,
## #   minimum_nights <int>, number_of_reviews <int>, last_review <chr>,
## #   reviews_per_month <dbl>, calculated_host_listings_count <int>,
## #   availability_365 <int>

ggplot(data = outliers, aes(x = price, fill = neighbourhood_group)) + 
  geom_histogram(bins= 100) +
  labs(title = "Outlier Prices per neighborhood (2500$ +)") +
  xlab("Price") + ylab("Frequency") + 
  scale_x_continuous(limits = c(2000, 10500), breaks = seq(2000, 10000, by = 1000)) +
  scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12, by = 2))

## Warning: Removed 10 rows containing missing values (`geom_bar()`).

These outliers are what causes the increase in the mean prices.

AirBnB listings in Brooklyn & Manhattan

airbnb_Brooklyn <- airbnb_df%>%
  filter(neighbourhood_group == "Brooklyn")

ggplot(airbnb_Brooklyn, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Listings in Brooklyn", x = "Listings", y = "Neighbourhood")

#-------------------------------------------------------------------------------
airbnb_Manhattan <- airbnb_df%>%
  filter(neighbourhood_group == "Manhattan")

ggplot(airbnb_Manhattan, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Listings in Manhattan", x = "Listings", y = "Neighbourhood")

Dataset 2; Spotify

(Found here: https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023)

spotify_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/spotify-2023.csv"
spotify_raw <- read.csv(spotify_url, header = TRUE, stringsAsFactors = FALSE)

# Make into dataframe
spotify_df <- as.data.frame(spotify_raw)  
head(spotify_df)

##                            track_name    artist.s._name artist_count
## 1 Seven (feat. Latto) (Explicit Ver.)  Latto, Jung Kook            2
## 2                                LALA       Myke Towers            1
## 3                             vampire    Olivia Rodrigo            1
## 4                        Cruel Summer      Taylor Swift            1
## 5                      WHERE SHE GOES         Bad Bunny            1
## 6                            Sprinter Dave, Central Cee            2
##   released_year released_month released_day in_spotify_playlists
## 1          2023              7           14                  553
## 2          2023              3           23                 1474
## 3          2023              6           30                 1397
## 4          2019              8           23                 7858
## 5          2023              5           18                 3133
## 6          2023              6            1                 2186
##   in_spotify_charts   streams in_apple_playlists in_apple_charts
## 1               147 141381703                 43             263
## 2                48 133716286                 48             126
## 3               113 140003974                 94             207
## 4               100 800840817                116             207
## 5                50 303236322                 84             133
## 6                91 183706234                 67             213
##   in_deezer_playlists in_deezer_charts in_shazam_charts bpm key  mode
## 1                  45               10              826 125   B Major
## 2                  58               14              382  92  C# Major
## 3                  91               14              949 138   F Major
## 4                 125               12              548 170   A Major
## 5                  87               15              425 144   A Minor
## 6                  88               17              946 141  C# Major
##   danceability_. valence_. energy_. acousticness_. instrumentalness_.
## 1             80        89       83             31                  0
## 2             71        61       74              7                  0
## 3             51        32       53             17                  0
## 4             55        58       72             11                  0
## 5             65        23       80             14                 63
## 6             92        66       58             19                  0
##   liveness_. speechiness_.
## 1          8             4
## 2         10             4
## 3         31             6
## 4         11            15
## 5         11             6
## 6          8            24

This dataset contains a comprehensive list of the most famous songs of 2023 as listed on Spotify. The dataset offers a wealth of features beyond what is typically available in similar datasets. It provides insights into each song’s attributes, popularity, and presence on various music platforms. The dataset includes information such as track name, artist(s) name, release date, Spotify playlists and charts, streaming statistics, Apple Music presence, Deezer presence, Shazam charts, and various audio features.

Removing messy data

# The data has lots of odd � errors (replacement character; https://charbase.com/fffd-unicode-replacement-character), mostly for  Hispanic  accents. 
library(stringi)

# Convert all non-ASCII characters to spaces in the song name
spotify_df[,1] <- stringi::stri_trans_general(spotify_df[,1], "Latin-ASCII")
spotify_df[,1] <- iconv(spotify_df[,1], to = "ASCII", sub = "")

# Convert all non-ASCII characters to spaces in the artist's name
spotify_df[,2] <- stringi::stri_trans_general(spotify_df[,2], "Latin-ASCII")
spotify_df[,2] <- iconv(spotify_df[,2], to = "ASCII", sub = "")

Most popular songs based on Spotify Playlists

most_popular_songs_spotifylists<- spotify_df %>% 
  group_by(released_year, released_month, track_name) %>% 
    summarise(Spotify_Playlist_Count = sum(in_spotify_playlists)) %>% 
        arrange(desc(Spotify_Playlist_Count))

## `summarise()` has grouped output by 'released_year', 'released_month'. You can
## override using the `.groups` argument.

head(most_popular_songs_spotifylists)

## # A tibble: 6 × 4
## # Groups:   released_year, released_month [5]
##   released_year released_month track_name                 Spotify_Playlist_Count
##           <int>          <int> <chr>                                       <int>
## 1          2013              1 Get Lucky - Radio Edit                      52898
## 2          2003              9 Mr. Brightside                              51979
## 3          2013              1 Wake Me Up - Radio Edit                     50887
## 4          1991              9 Smells Like Teen Spirit -…                  49991
## 5          1984             10 Take On Me                                  44927
## 6          2019             11 Blinding Lights                             43899

Most popular songs based on streams

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library(formattable)

## 
## Attaching package: 'formattable'

## The following objects are masked from 'package:scales':
## 
##     comma, percent, scientific

# Streams column not numeric, change that
spotify_df$streams <- as.numeric(spotify_df$streams)

## Warning: NAs introduced by coercion

# For the one with no value, just use the average amount
spotify_df[575,9] <- colMeans(spotify_df[ ,9, drop =FALSE], na.rm = TRUE)

# Now that streams is a numeric field, we can get the total plays for each song
most_popular_songs_streams<- spotify_df %>% 
  group_by(released_year, released_month, track_name) %>% 
    summarise(Stream_Plays = sum(streams)) %>% 
        arrange(desc(Stream_Plays))

## `summarise()` has grouped output by 'released_year', 'released_month'. You can
## override using the `.groups` argument.

# Just used to make the number legible
my_comma <- scales::label_comma(accuracy = .1, big.mark = ",", decimal.mark = ".")
most_popular_songs_streams$Stream_Plays <- my_comma(most_popular_songs_streams$Stream_Plays)

# Output
head(most_popular_songs_streams)

## # A tibble: 6 × 4
## # Groups:   released_year, released_month [6]
##   released_year released_month track_name                           Stream_Plays
##           <int>          <int> <chr>                                <chr>       
## 1          2019             11 Blinding Lights                      3,703,895,0…
## 2          2017              1 Shape of You                         3,562,543,8…
## 3          2018             11 Someone You Loved                    2,887,241,8…
## 4          2019              5 Dance Monkey                         2,864,791,6…
## 5          2018             10 Sunflower - Spider-Man: Into the Sp… 2,808,096,5…
## 6          2016              4 One Dance                            2,713,922,3…

So we now have the most popular songs based on the number of times they were streamed and the number of playlists they’re in.

Dataset 3; Diabetes

(Found here: https://www.kaggle.com/datasets/aemyjutt/diabetesdataanslysis)

diabetes_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/diabetes.csv"
diabetes_raw <- read.csv(diabetes_url, header = TRUE, stringsAsFactors = FALSE)

# Make into dataframe
diabetes_df <- as.data.frame(diabetes_raw)  
head(diabetes_df)

##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

Change 0/1 Diabetes diagnosis to Negative/Positive diagnosis

diabetes_df <- diabetes_df %>% 
   mutate(Outcome = ifelse(Outcome == '1', "Positive", "Negative"))

head(diabetes_df)

##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age  Outcome
## 1                    0.627  50 Positive
## 2                    0.351  31 Negative
## 3                    0.672  32 Positive
## 4                    0.167  21 Negative
## 5                    2.288  33 Positive
## 6                    0.201  30 Negative

Address missing values (Except for Pregnancy)

diabetes_df <-  diabetes_df %>%
  #mutate(across(.cols = Glucose:Age, .fns = ~ifelse(.x == 0, round(mean(.x), digits = 0), .x)))
  mutate(across(.cols = Glucose:Age, ~ replace(., . == 0, round(mean(., na.rm = TRUE),0))))
# Didn't mutate to zero-values as it could be men.

head(diabetes_df)

##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35      80 33.6
## 2           1      85            66            29      80 26.6
## 3           8     183            64            21      80 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74            21      80 25.6
##   DiabetesPedigreeFunction Age  Outcome
## 1                    0.627  50 Positive
## 2                    0.351  31 Negative
## 3                    0.672  32 Positive
## 4                    0.167  21 Negative
## 5                    2.288  33 Positive
## 6                    0.201  30 Negative

Basic histograms vs Diabetes Diagnosis

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:formattable':
## 
##     style

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

g_preg <- ggplot(diabetes_df, aes(x=Pregnancies, fill = Outcome)) +
  geom_histogram(binwidth = 0.5) + 
  theme(legend.position = "none")

g_glucose <- ggplot(diabetes_df, aes(x=Glucose, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_bp <-ggplot(diabetes_df, aes(x=BloodPressure, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_skin <-ggplot(diabetes_df, aes(x=SkinThickness, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_insulin <-ggplot(diabetes_df, aes(x=Insulin, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_bmi <-ggplot(diabetes_df, aes(x=BMI, fill = Outcome)) +
   geom_histogram(binwidth = 1) + 
   theme(legend.position = "none")

g_dpf <-ggplot(diabetes_df, aes(x=DiabetesPedigreeFunction, fill = Outcome)) +
   geom_histogram(binwidth = 0.05) + 
   theme(legend.position = "none")

g_age <-ggplot(diabetes_df, aes(x=Age, fill = Outcome)) +
   geom_histogram() + 
   theme(legend.position = "none")

g_diabetes <-ggplot(diabetes_df, aes(x=Outcome, fill = Outcome)) +
   geom_histogram(stat="count")

## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

#-------------------------------------------------------------------------------
library(patchwork)

## 
## Attaching package: 'patchwork'

## The following object is masked from 'package:formattable':
## 
##     area

# https://patchwork.data-imaginist.com/

(g_preg + g_glucose + g_bp ) /
(g_skin + g_insulin + g_bmi) /
(g_dpf  + g_age     + g_diabetes)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What is immediately evident after looking at these distributions, is that you see higher rates of positive diabetes in people with higher glucose, as there doesn’t seem to be a strong correlation between the other fields. I attempted to find the correlation between the other 8 fields with the diabetes outcome, but seeing as it is a character, I’d need to use logistic regression to measure it compared to the numeric features, by having the categorical feature as a target and numeric as an input. Alternatively, use the Chi-Square test and ANOVA (Analysis of Variance) (https://datascience.stackexchange.com/questions/893/how-to-get-correlation-between-two-categorical-variable-and-a-categorical-variab)

Attempt below, but was unsuccessful.

#categorical_var <- factor(c("Positive", "Negative"))
# Convert the categorical variable to a binary variable
#binary_var <- as.numeric(categorical_var) - 1
# Calculate point-biserial correlation
#correlation <- cor(binary_var, diabetes_df$Pregnancies)
#-------------------------------------------------------------------------------
# ANOVA attempt
#anova_result <- aov(diabetes_df$Pregnancies ~ categorical_var)

#-------------------------------------------------------------------------------
# Base correlations won't work here.
#cor(diabetes_df$Outcome, diabetes_df$Pregnancies)
#cor(diabetes_df$Outcome, diabetes_df$Glucose)
#cor(diabetes_df$Outcome, diabetes_df$BloodPressure)
#cor(diabetes_df$Outcome, diabetes_df$SkinThickness)
#cor(diabetes_df$Outcome, diabetes_df$Insulin)
#cor(diabetes_df$Outcome, diabetes_df$BMI)
#cor(diabetes_df$Outcome, diabetes_df$DiabetesPedigreeFunction)
#cor(diabetes_df$Outcome, diabetes_df$Age)

Those who have diabetes (268/ 768)

diabetes_positive <- diabetes_df %>%
  filter(Outcome == "Positive")

diabetes_positive  %>%
  group_by(Glucose, Age, BloodPressure) %>% 
        arrange(desc(Glucose))

## # A tibble: 268 × 9
## # Groups:   Glucose, Age, BloodPressure [268]
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <int>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           1     199            76            43      80  42.9
##  2           0     198            66            32     274  41.3
##  3           2     197            70            45     543  30.5
##  4           8     197            74            21      80  25.9
##  5           2     197            70            99      80  34.7
##  6           7     196            90            21      80  39.8
##  7           8     196            76            29     280  37.5
##  8           1     196            76            36     249  36.5
##  9           7     195            70            33     145  25.1
## 10           6     195            70            21      80  30.9
## # ℹ 258 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <chr>

#——————————————————————————————————- # Folorunsho Atanda

Load required library

library(tidyverse)
library(RMySQL)

## Loading required package: DBI

library(dotenv)

Create database connection

dotenv::load_dot_env(file = "sql_pw.env")
my_sql_pw <- Sys.getenv("MYSQL_PW")

db <- dbConnect(
  MySQL(),
  user = "root",
  #password = my_sql_pw,
  password = "0830spscuny2023!!!",
  dbname = "project_2",
  host = "localhost",
  port = 3306
)

Load table from database as a data frame

diabetes <- db %>% 
  dbGetQuery("select * from project_2.diabetes")

diabetes_df <- as_tibble(diabetes)

# Disconnect database
db_status <- dbDisconnect(db)

Because 0 can’t exist in the columns Glucose, BloodPressure, SkinThickness, Insulin, BMI, DPF and Age, going to change does to NA. Then also change 0 to Negative and 1 to Positive in Outcome

columns_to_modify <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin",
                       "BMI", "DPF", "Age")

diabetes_df <- diabetes_df %>% 
  mutate(across(all_of(columns_to_modify), ~if_else(. == 0, NA, .))) %>% 
  mutate("Outcome" = ifelse(`Outcome` == 0, "Negative", "Positive"))

Summarise data

glimpse(diabetes_df)

## Rows: 768
## Columns: 10
## $ ID            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ Pregnancies   <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7,…
## $ Glucose       <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168,…
## $ BloodPressure <int> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, …
## $ SkinThickness <int> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, …
## $ Insulin       <int> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 84…
## $ BMI           <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA…
## $ DPF           <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, …
## $ Age           <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, …
## $ Outcome       <chr> "Positive", "Negative", "Positive", "Negative", "Positiv…

summary(diabetes_df)

##        ID         Pregnancies        Glucose      BloodPressure   
##  Min.   :  1.0   Min.   : 0.000   Min.   : 44.0   Min.   : 24.00  
##  1st Qu.:192.8   1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00  
##  Median :384.5   Median : 3.000   Median :117.0   Median : 72.00  
##  Mean   :384.5   Mean   : 3.845   Mean   :121.7   Mean   : 72.41  
##  3rd Qu.:576.2   3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00  
##  Max.   :768.0   Max.   :17.000   Max.   :199.0   Max.   :122.00  
##                                   NA's   :5       NA's   :35      
##  SkinThickness      Insulin            BMI             DPF        
##  Min.   : 7.00   Min.   : 14.00   Min.   :18.20   Min.   :0.0780  
##  1st Qu.:22.00   1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437  
##  Median :29.00   Median :125.00   Median :32.30   Median :0.3725  
##  Mean   :29.15   Mean   :155.55   Mean   :32.46   Mean   :0.4719  
##  3rd Qu.:36.00   3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262  
##  Max.   :99.00   Max.   :846.00   Max.   :67.10   Max.   :2.4200  
##  NA's   :227     NA's   :374      NA's   :11                      
##       Age          Outcome         
##  Min.   :21.00   Length:768        
##  1st Qu.:24.00   Class :character  
##  Median :29.00   Mode  :character  
##  Mean   :33.24                     
##  3rd Qu.:41.00                     
##  Max.   :81.00                     
##

Compare Variables

Pregnancies vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Pregnancies`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge", 
    na.rm = TRUE) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Pregnancies`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    ) + 
  labs(title = "Pregnancies vs Outcome")

diabetes_df %>% 
  select(c(`Pregnancies`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Pregnancies_mean = mean(`Pregnancies`), 
    Pregnancies_median = median(`Pregnancies`),
    Pregnancies_Q1 = quantile(`Pregnancies`, probs = 0.25),
    Pregnancies_Q2 = quantile(`Pregnancies`, probs = 0.75)
    )

## # A tibble: 2 × 5
##   Outcome  Pregnancies_mean Pregnancies_median Pregnancies_Q1 Pregnancies_Q2
##   <chr>               <dbl>              <dbl>          <dbl>          <dbl>
## 1 Negative             3.30                  2           1                 5
## 2 Positive             4.87                  4           1.75              8

Analysis: The distribution of Pregnancies to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

Glucose vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Glucose`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Glucose`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    ) + 
  labs(title = "Glucose vs Outcome")

diabetes_df %>% 
  select(c(`Glucose`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Glucose_mean = mean(`Glucose`, na.rm = TRUE), 
    Glucose_median = median(`Glucose`, na.rm = TRUE),
    Glucose_Q1 = quantile(`Glucose`, probs = 0.25, na.rm = TRUE),
    Glucose_Q2 = quantile(`Glucose`, probs = 0.75, na.rm = TRUE)
    )

## # A tibble: 2 × 5
##   Outcome  Glucose_mean Glucose_median Glucose_Q1 Glucose_Q2
##   <chr>           <dbl>          <dbl>      <dbl>      <dbl>
## 1 Negative         111.            107         93        125
## 2 Positive         142.            140        119        167

Analysis: The distribution of Glucose to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 93 and 167.

Blood Pressure vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `BloodPressure`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `BloodPressure`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    )

diabetes_df %>% 
  select(c(`BloodPressure`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    BloodPressure_mean = mean(`BloodPressure`, na.rm = TRUE), 
    BloodPressure_median = median(`BloodPressure`, na.rm = TRUE),
    BloodPressure_Q1 = quantile(`BloodPressure`, probs = 0.25, na.rm = TRUE),
    BloodPressure_Q2 = quantile(`BloodPressure`, probs = 0.75, na.rm = TRUE)
    )

## # A tibble: 2 × 5
##   Outcome  BloodPressure_mean BloodPressure_median BloodPressure_Q1
##   <chr>                 <dbl>                <dbl>            <dbl>
## 1 Negative               70.9                 70                 62
## 2 Positive               75.3                 74.5               68
## # ℹ 1 more variable: BloodPressure_Q2 <dbl>

Analysis: The distribution of BloodPressure to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 62 and 84.

Skin Thickness vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `SkinThickness`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `SkinThickness`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    )

diabetes_df %>% 
  select(c(`SkinThickness`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    SkinThickness_mean = mean(`SkinThickness`, na.rm = TRUE), 
    SkinThickness_median = median(`SkinThickness`, na.rm = TRUE),
    SkinThickness_Q1 = quantile(`SkinThickness`, probs = 0.25, na.rm = TRUE),
    SkinThickness_Q2 = quantile(`SkinThickness`, probs = 0.75, na.rm = TRUE)
    )

## # A tibble: 2 × 5
##   Outcome  SkinThickness_mean SkinThickness_median SkinThickness_Q1
##   <chr>                 <dbl>                <dbl>            <dbl>
## 1 Negative               27.2                   27               19
## 2 Positive               33                     32               27
## # ℹ 1 more variable: SkinThickness_Q2 <dbl>

Analysis: The distribution of SkinThickness to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 19 and 39.

Insulin vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Insulin`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Insulin`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    )

diabetes_df %>% 
  select(c(`Insulin`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Insulin_mean = mean(`Insulin`, na.rm = TRUE), 
    Insulin_median = median(`Insulin`, na.rm = TRUE),
    Insulin_Q1 = quantile(`Insulin`, probs = 0.25, na.rm = TRUE),
    Insulin_Q2 = quantile(`Insulin`, probs = 0.75, na.rm = TRUE)
    )

## # A tibble: 2 × 5
##   Outcome  Insulin_mean Insulin_median Insulin_Q1 Insulin_Q2
##   <chr>           <dbl>          <dbl>      <dbl>      <dbl>
## 1 Negative         130.           102.        66        161.
## 2 Positive         207.           170.       128.       239.

Analysis: The distribution of Insulin to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

BMI vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `BMI`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `BMI`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    )

diabetes_df %>% 
  select(c(`BMI`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    BMI_mean = mean(`BMI`, na.rm = TRUE), 
    BMI_median = median(`BMI`, na.rm = TRUE),
    BMI_Q1 = quantile(`BMI`, probs = 0.25, na.rm = TRUE),
    BMI_Q2 = quantile(`BMI`, probs = 0.75, na.rm = TRUE)
    )

## # A tibble: 2 × 5
##   Outcome  BMI_mean BMI_median BMI_Q1 BMI_Q2
##   <chr>       <dbl>      <dbl>  <dbl>  <dbl>
## 1 Negative     30.9       30.1   25.6   35.3
## 2 Positive     35.4       34.3   30.9   38.9

Analysis: The distribution of BMI to Outcome is symmetrical. This tells us that the median and mean are very close to one another. Which is confirmed in the box plot and the stat table. From the stat table we can infer that most of the data is between 19 and 39.

DPF vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `DPF`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `DPF`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    )

diabetes_df %>% 
  select(c(`DPF`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    DPF_mean = mean(`DPF`, na.rm = TRUE), 
    DPF_median = median(`DPF`, na.rm = TRUE),
    DPF_Q1 = quantile(`DPF`, probs = 0.25, na.rm = TRUE),
    DPF_Q2 = quantile(`DPF`, probs = 0.75, na.rm = TRUE)
    )

## # A tibble: 2 × 5
##   Outcome  DPF_mean DPF_median DPF_Q1 DPF_Q2
##   <chr>       <dbl>      <dbl>  <dbl>  <dbl>
## 1 Negative    0.430      0.336  0.230  0.562
## 2 Positive    0.550      0.449  0.262  0.728

Analysis: The distribution of DPF to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

Age vs Outcome

diabetes_df %>% 
  ggplot(aes(x = `Age`, fill = `Outcome`)) +
  geom_bar(
    stat = "count", 
    position = "dodge",
    na.rm = TRUE
    ) +
  facet_wrap(~`Outcome`)

diabetes_df %>% 
  ggplot(aes(x = `Outcome`, y = `Age`)) +
  geom_boxplot(
    stat = "boxplot",
    color = c("red","blue"),
    na.rm = TRUE
    )

diabetes_df %>% 
  select(c(`Age`, `Outcome`)) %>% 
  group_by(`Outcome`) %>% 
  summarise(
    Age_mean = mean(`Age`), 
    Age_median = median(`Age`),
    Age_Q1 = quantile(`Age`, probs = 0.25),
    Age_Q2 = quantile(`Age`, probs = 0.75)
    )

## # A tibble: 2 × 5
##   Outcome  Age_mean Age_median Age_Q1 Age_Q2
##   <chr>       <dbl>      <dbl>  <dbl>  <dbl>
## 1 Negative     31.2         27     23     37
## 2 Positive     37.1         36     28     44

Analysis: The distribution of Age to Outcome is right skewed. This tells us that the median and mean are not the same. Which is confirmed in the box plot and the stat table.

Correlation

Because Glucose, BloodPressure, and BMI have normal distribution, I want to see if there is a correlation between them, focusing on a positive diabetes Outcome.

diabetes_cor <- diabetes_df %>% 
  select(c(`Glucose`, `BloodPressure`, `BMI`, `Outcome`)) %>% 
  filter(`Outcome` == "Positive") 

Glucose_BloodPressure_cor <- cor(
  x = as.numeric(diabetes_cor$Glucose),  
  y = as.numeric(diabetes_cor$BloodPressure), 
  use = "na.or.complete",
  method = "pearson"
  )

Glucose_BMI_cor <- cor(
  x = as.numeric(diabetes_cor$Glucose),  
  y = as.numeric(diabetes_cor$BMI), 
  use = "na.or.complete", 
  method = "pearson"
  )

BloodPressure_BMI_cor <- cor(
  x = as.numeric(diabetes_cor$BloodPressure),  
  y = as.numeric(diabetes_cor$BMI), 
  use = "na.or.complete",
  method = "pearson"
  )

correlation_df <- tibble(
  Relationship = c("Glucose vs Blood Pressure", "Glucose vs BMI",
                   "Blood Pressure vs BMI"),
  Correlation = c(Glucose_BloodPressure_cor, Glucose_BMI_cor, 
                  BloodPressure_BMI_cor)
)

correlation_df

## # A tibble: 3 × 2
##   Relationship              Correlation
##   <chr>                           <dbl>
## 1 Glucose vs Blood Pressure      0.101 
## 2 Glucose vs BMI                 0.0566
## 3 Blood Pressure vs BMI          0.249

Analysis: From the table we see that there is no correlation between these three variables.