Dataset 1; AirBnB
airbnb_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/AB_NYC_2019.csv"
airbnb_raw <- read.csv(airbnb_url, header = TRUE, stringsAsFactors = FALSE)
# Make into dataframe
airbnb_df <- as.data.frame(airbnb_raw)
head(airbnb_raw)
## id name host_id host_name
## 1 2539 Clean & quiet apt home by the park 2787 John
## 2 2595 Skylit Midtown Castle 2845 Jennifer
## 3 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth
## 4 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura
## 6 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris
## neighbourhood_group neighbourhood latitude longitude room_type price
## 1 Brooklyn Kensington 40.64749 -73.97237 Private room 149
## 2 Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225
## 3 Manhattan Harlem 40.80902 -73.94190 Private room 150
## 4 Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89
## 5 Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80
## 6 Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200
## minimum_nights number_of_reviews last_review reviews_per_month
## 1 1 9 2018-10-19 0.21
## 2 1 45 2019-05-21 0.38
## 3 3 0 NA
## 4 1 270 2019-07-05 4.64
## 5 10 9 2018-11-19 0.10
## 6 3 74 2019-06-22 0.59
## calculated_host_listings_count availability_365
## 1 6 365
## 2 2 355
## 3 1 365
## 4 1 194
## 5 1 0
## 6 1 129
Prices overall
ggplot(airbnb_df, aes(x = price)) +
geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) + # Hard axis limits like the x-axis one above drop some rows (see the warnings below)
geom_vline(xintercept = median(airbnb_df$price), color = "red", linewidth = 1, linetype = "solid") +
geom_vline(xintercept = mean(airbnb_df$price), color = "darkgreen", linewidth = 1, linetype = "solid") +
annotate("text", x = 500, y = 2500, label = "Median Price = $106", color = "red", size = 5) +
annotate("text", x = 500, y = 3500, label = "Mean Price = $163", color = "darkgreen", size = 5) +
labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices")
## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

We can see that the majority of AirBnB listings fall within the $0 to $300 price
range, with a long right tail of outliers. The warnings above come from the
$1,000 x-axis limit, which drops the listings priced above it. We will revisit
the outliers later.
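To put a number on this claim, here is a minimal sketch (it reuses airbnb_df and assumes dplyr is loaded, as in the rest of this report) of the share of listings at or below $300 and the share above the $1,000 axis limit:
# Share of listings priced at $300 or below, and the share above the $1,000 axis limit
airbnb_df %>%
  summarise(n_listings = n(),
            share_0_300 = mean(price <= 300),
            share_over_1000 = mean(price > 1000))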
Frequency of Prices per neighborhood
# Get Mean and Median prices for each neighborhood
price_neighborhood_group <- airbnb_df %>%
group_by(neighbourhood_group) %>%
dplyr::summarise(median_price = round(median(price),0),
mean_price = round(mean(price),0))
# Plot- more detailed breakdown of prior graph.
ggplot(airbnb_df, aes(x = price)) +
geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices per Neighborhood") +
facet_wrap(~neighbourhood_group) +
geom_text(data = price_neighborhood_group, y = 3000, aes(x = 500, label = paste("Mean Price = $", mean_price)), color = "darkgreen", size = 3) +
geom_text(data = price_neighborhood_group, y = 2500, aes(x = 500, label = paste("Median Price = $", median_price)), color = "red", size = 3)
## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 10 rows containing missing values (`geom_bar()`).

Here is the price breakdown for AirBnB rentals by borough. As expected, the
majority of the listings are in Brooklyn and Manhattan.
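A quick count per borough (a small sketch reusing airbnb_df) backs this up:
# Listings per borough and their share of the total
airbnb_df %>%
  count(neighbourhood_group, sort = TRUE) %>%
  mutate(share = round(n / sum(n), 3))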
Price versus reviews
ggplot(airbnb_df, aes(x = number_of_reviews, y = price, color = room_type)) +
#geom_point(alpha = 0.5, color = "blue") +
geom_point() +
facet_wrap(~neighbourhood_group, scales = "free") +
scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
#scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews")

You can see that the majority of rentals are for guests taking the entire place
to themselves, as very few people want to share a room or apartment on vacation.
However, there does not seem to be a strong correlation between how many reviews
a place has and its price, as many listings have both a low price and few
reviews. Let's look at this again where the plot is denser and we have more data
to view, capping the price at $1,000.
Price versus reviews below $1,000
airbnb_df_below1000 <- airbnb_df %>%
filter(price < 1000)
ggplot(airbnb_df_below1000, aes(x = number_of_reviews, y = price, color = room_type)) +
#geom_point(alpha = 0.5, color = "blue") +
geom_point() +
facet_wrap(~neighbourhood_group, scales = "free") +
scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
#scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews for AirBnB below 1000$")

#-------------------------------------------------------------------------------
# Looking at the outliers for price vs reviews
airbnb_manyreviews_highprice <- airbnb_df %>%
filter(number_of_reviews > 400 & price > 500)
ggplot(airbnb_manyreviews_highprice, aes(x = number_of_reviews, y = price)) +
#geom_point(alpha = 0.5, color = "blue") +
geom_point() +
geom_text(aes(label = name)) +
facet_wrap(~neighbourhood_group, scales = "free") +
scale_x_continuous(limits = c(446, 448), breaks = seq(446, 448, by = 1)) +
scale_y_continuous(limits = c(574, 576), breaks = seq(574, 576, by = 1)) +
labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews: the high-review, high-price outlier")

Here we have a better view of the data. The two fields do not appear strongly
correlated: there are many rentals with a low price and few reviews, and also
many rentals with a low price and many reviews. What is clear, however, is that
as the price increases there are generally fewer reviews, since fewer customers
rent the more expensive units.
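To check the correlation claim directly, a minimal sketch computing the Pearson and Spearman correlations between reviews and price (Spearman is less sensitive to the extreme prices):
# Correlation between number of reviews and price
cor(airbnb_df$number_of_reviews, airbnb_df$price, method = "pearson")
cor(airbnb_df$number_of_reviews, airbnb_df$price, method = "spearman")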
Interestingly enough, there is an AirBnB in Manhattan that has roughly 450
reviews and costs about $600 a night. Looking into why this one listing stands
out so much, the reason becomes obvious: it is a beautiful 2,500 sq. ft.
apartment with its own elevator, which, out of curiosity, I also found on Google Maps.
AirBnB types per neighborhood
airbnb_neighborhood_homes <-airbnb_df %>%
group_by(neighbourhood_group) %>%
count(room_type)
airbnb_neighborhood_homes
## # A tibble: 15 × 3
## # Groups: neighbourhood_group [5]
## neighbourhood_group room_type n
## <chr> <chr> <int>
## 1 Bronx Entire home/apt 379
## 2 Bronx Private room 651
## 3 Bronx Shared room 60
## 4 Brooklyn Entire home/apt 9558
## 5 Brooklyn Private room 10126
## 6 Brooklyn Shared room 411
## 7 Manhattan Entire home/apt 13198
## 8 Manhattan Private room 7982
## 9 Manhattan Shared room 480
## 10 Queens Entire home/apt 2096
## 11 Queens Private room 3372
## 12 Queens Shared room 198
## 13 Staten Island Entire home/apt 176
## 14 Staten Island Private room 188
## 15 Staten Island Shared room 9
# Types of AirBnB's in each neighborhood
ggplot(airbnb_neighborhood_homes, aes(x = neighbourhood_group, y = n, fill = room_type)) +
geom_bar(position = "dodge", stat = "identity") +
labs(title = "Types of AirBnB's", x = "Neighbourhood", y = "Count") +
coord_flip()

Once again, Brooklyn and Manhattan account for the majority of rentals, and most
listings are either for the entire place or for a private room. In the latter
case, guests are probably sharing a place with their friends for vacation.
Types of AirBnB’s in each Neighborhood
ggplot(data = airbnb_df) +
geom_bar(mapping = aes(x=room_type, fill=neighbourhood_group), position = "dodge") +
labs(title = "AirBnB Listings per Neighborhood ", x = "Listings", y = "Count") +
coord_flip()

AirBnB prices in each neighborhood
# By neighborhood
airbnb_neighborhood_prices <-airbnb_df %>%
group_by(neighbourhood_group, neighbourhood) %>%
summarize(avg_price = mean(price),
min_price = min(price),
max_price = max(price))
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
ggplot(data = airbnb_neighborhood_prices, aes(x = neighbourhood, y = avg_price, color = neighbourhood)) +
geom_point() +
labs(title = " Average Prices per neighborhood") +
xlab("Neighborhood") + ylab("Average Price") +
theme(axis.text.x=element_blank()) + # axis.text.x=element_blank() removes all neighborhood names
theme(legend.position = "none") +
facet_wrap(~neighbourhood_group)

The majority of the more expensive places are in Manhattan, with some very odd
outliers priced over $400 in the other boroughs of NYC. This is driven by a
handful of extreme listings that cost more than $2,500, as seen in 'Price versus
reviews'.
outliers <- airbnb_df %>%
filter(price >= 2500)
outliers %>%
group_by(neighbourhood_group, neighbourhood) %>%
arrange(price)
## # A tibble: 77 × 16
## # Groups: neighbourhood_group, neighbourhood [36]
## id name host_id host_name neighbourhood_group neighbourhood latitude
## <int> <chr> <int> <chr> <chr> <chr> <dbl>
## 1 893413 "Archi… 4.75e6 Martin Manhattan East Village 40.7
## 2 2276383 "Penth… 1.16e7 Mike Manhattan Greenwich Vi… 40.7
## 3 12339863 "Loft" 1.00e7 Claudine Manhattan Tribeca 40.7
## 4 14408114 "Unpar… 8.36e5 Henry Manhattan Midtown 40.8
## 5 19554980 "A Coz… 9.78e7 Logan Manhattan Upper West S… 40.8
## 6 19698169 "\"The… 1.32e8 Kathy Bronx Riverdale 40.9
## 7 23373090 "SHOOT… 1.18e7 V Brooklyn Williamsburg 40.7
## 8 25018204 "Parad… 1.73e8 Rasmus Manhattan Harlem 40.8
## 9 31470004 "Priva… 7.12e7 Max Manhattan East Village 40.7
## 10 34592851 "Beaut… 1.99e7 Cheryl Brooklyn Crown Heights 40.7
## # ℹ 67 more rows
## # ℹ 9 more variables: longitude <dbl>, room_type <chr>, price <int>,
## # minimum_nights <int>, number_of_reviews <int>, last_review <chr>,
## # reviews_per_month <dbl>, calculated_host_listings_count <int>,
## # availability_365 <int>
ggplot(data = outliers, aes(x = price, fill = neighbourhood_group)) +
geom_histogram(bins= 100) +
labs(title = "Outlier Prices per neighborhood (2500$ +)") +
xlab("Price") + ylab("Frequency") +
scale_x_continuous(limits = c(2000, 10500), breaks = seq(2000, 10000, by = 1000)) +
scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12, by = 2))
## Warning: Removed 10 rows containing missing values (`geom_bar()`).

These outliers are what causes the mean prices to sit well above the medians.
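Their effect can be seen by comparing the overall mean against the mean with the $2,500+ listings removed; a minimal sketch reusing airbnb_df and the same cutoff:
# Mean price with and without the $2,500+ outliers, plus the (robust) median for comparison
mean(airbnb_df$price)
mean(airbnb_df$price[airbnb_df$price < 2500])
median(airbnb_df$price)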
AirBnB listings in Brooklyn & Manhattan
airbnb_Brooklyn <- airbnb_df%>%
filter(neighbourhood_group == "Brooklyn")
ggplot(airbnb_Brooklyn, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
geom_bar(position = "dodge", stat = "identity") +
labs(title = "Listings in Brooklyn", x = "Listings", y = "Neighbourhood")

#-------------------------------------------------------------------------------
airbnb_Manhattan <- airbnb_df%>%
filter(neighbourhood_group == "Manhattan")
ggplot(airbnb_Manhattan, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
geom_bar(position = "dodge", stat = "identity") +
labs(title = "Listings in Manhattan", x = "Listings", y = "Neighbourhood")

Dataset 2; Spotify
spotify_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/spotify-2023.csv"
spotify_raw <- read.csv(spotify_url, header = TRUE, stringsAsFactors = FALSE)
# Make into dataframe
spotify_df <- as.data.frame(spotify_raw)
head(spotify_df)
## track_name artist.s._name artist_count
## 1 Seven (feat. Latto) (Explicit Ver.) Latto, Jung Kook 2
## 2 LALA Myke Towers 1
## 3 vampire Olivia Rodrigo 1
## 4 Cruel Summer Taylor Swift 1
## 5 WHERE SHE GOES Bad Bunny 1
## 6 Sprinter Dave, Central Cee 2
## released_year released_month released_day in_spotify_playlists
## 1 2023 7 14 553
## 2 2023 3 23 1474
## 3 2023 6 30 1397
## 4 2019 8 23 7858
## 5 2023 5 18 3133
## 6 2023 6 1 2186
## in_spotify_charts streams in_apple_playlists in_apple_charts
## 1 147 141381703 43 263
## 2 48 133716286 48 126
## 3 113 140003974 94 207
## 4 100 800840817 116 207
## 5 50 303236322 84 133
## 6 91 183706234 67 213
## in_deezer_playlists in_deezer_charts in_shazam_charts bpm key mode
## 1 45 10 826 125 B Major
## 2 58 14 382 92 C# Major
## 3 91 14 949 138 F Major
## 4 125 12 548 170 A Major
## 5 87 15 425 144 A Minor
## 6 88 17 946 141 C# Major
## danceability_. valence_. energy_. acousticness_. instrumentalness_.
## 1 80 89 83 31 0
## 2 71 61 74 7 0
## 3 51 32 53 17 0
## 4 55 58 72 11 0
## 5 65 23 80 14 63
## 6 92 66 58 19 0
## liveness_. speechiness_.
## 1 8 4
## 2 10 4
## 3 31 6
## 4 11 15
## 5 11 6
## 6 8 24
This dataset contains a comprehensive list of the most famous songs
of 2023 as listed on Spotify. The dataset offers a wealth of features
beyond what is typically available in similar datasets. It provides
insights into each song’s attributes, popularity, and presence on
various music platforms. The dataset includes information such as track
name, artist(s) name, release date, Spotify playlists and charts,
streaming statistics, Apple Music presence, Deezer presence, Shazam
charts, and various audio features.
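Before cleaning, it helps to inspect the column types; a minimal sketch using dplyr::glimpse() (assuming dplyr is loaded, as elsewhere in this report) shows which columns arrived as character rather than numeric, which matters for the streams fix below:
# Quick look at column names and types
dplyr::glimpse(spotify_df)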
Removing messy data
# The data has lots of odd � characters (the Unicode replacement character; https://charbase.com/fffd-unicode-replacement-character), mostly from accented Spanish text.
library(stringi)
# Transliterate accented characters to plain ASCII in the song name, then drop anything still non-ASCII
spotify_df[,1] <- stringi::stri_trans_general(spotify_df[,1], "Latin-ASCII")
spotify_df[,1] <- iconv(spotify_df[,1], to = "ASCII", sub = "")
# Do the same for the artist name
spotify_df[,2] <- stringi::stri_trans_general(spotify_df[,2], "Latin-ASCII")
spotify_df[,2] <- iconv(spotify_df[,2], to = "ASCII", sub = "")
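To confirm the cleanup worked, a small sketch using stringi (already loaded above) counts how many names still contain non-ASCII characters; both counts should come back 0:
# Rows still containing non-ASCII characters after cleaning
sum(!stringi::stri_enc_isascii(spotify_df$track_name))
sum(!stringi::stri_enc_isascii(spotify_df$artist.s._name))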
Most popular songs based on Spotify Playlists
most_popular_songs_spotifylists<- spotify_df %>%
group_by(released_year, released_month, track_name) %>%
summarise(Spotify_Playlist_Count = sum(in_spotify_playlists)) %>%
arrange(desc(Spotify_Playlist_Count))
## `summarise()` has grouped output by 'released_year', 'released_month'. You can
## override using the `.groups` argument.
head(most_popular_songs_spotifylists)
## # A tibble: 6 × 4
## # Groups: released_year, released_month [5]
## released_year released_month track_name Spotify_Playlist_Count
## <int> <int> <chr> <int>
## 1 2013 1 Get Lucky - Radio Edit 52898
## 2 2003 9 Mr. Brightside 51979
## 3 2013 1 Wake Me Up - Radio Edit 50887
## 4 1991 9 Smells Like Teen Spirit -… 49991
## 5 1984 10 Take On Me 44927
## 6 2019 11 Blinding Lights 43899
Most popular songs based on streams
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(formattable)
##
## Attaching package: 'formattable'
## The following objects are masked from 'package:scales':
##
## comma, percent, scientific
# Streams column not numeric, change that
spotify_df$streams <- as.numeric(spotify_df$streams)
## Warning: NAs introduced by coercion
# For the one row with no value (row 575), impute the column mean
spotify_df[575, 9] <- colMeans(spotify_df[, 9, drop = FALSE], na.rm = TRUE)
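# A more defensive alternative (sketch): instead of hard-coding row 575, replace
# any NA in streams with the column mean, so the fix does not depend on row order.
spotify_df$streams[is.na(spotify_df$streams)] <- mean(spotify_df$streams, na.rm = TRUE)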
# Now that streams is a numeric field, we can get the total plays for each song
most_popular_songs_streams<- spotify_df %>%
group_by(released_year, released_month, track_name) %>%
summarise(Stream_Plays = sum(streams)) %>%
arrange(desc(Stream_Plays))
## `summarise()` has grouped output by 'released_year', 'released_month'. You can
## override using the `.groups` argument.
# Just used to make the number legible
my_comma <- scales::label_comma(accuracy = .1, big.mark = ",", decimal.mark = ".")
most_popular_songs_streams$Stream_Plays <- my_comma(most_popular_songs_streams$Stream_Plays)
# Output
head(most_popular_songs_streams)
## # A tibble: 6 × 4
## # Groups: released_year, released_month [6]
## released_year released_month track_name Stream_Plays
## <int> <int> <chr> <chr>
## 1 2019 11 Blinding Lights 3,703,895,0…
## 2 2017 1 Shape of You 3,562,543,8…
## 3 2018 11 Someone You Loved 2,887,241,8…
## 4 2019 5 Dance Monkey 2,864,791,6…
## 5 2018 10 Sunflower - Spider-Man: Into the Sp… 2,808,096,5…
## 6 2016 4 One Dance 2,713,922,3…
We now have the most popular songs both by the total number of streams and by
the number of Spotify playlists they appear in.
Dataset 3; Diabetes
diabetes_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/diabetes.csv"
diabetes_raw <- read.csv(diabetes_url, header = TRUE, stringsAsFactors = FALSE)
# Make into dataframe
diabetes_df <- as.data.frame(diabetes_raw)
head(diabetes_df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
Change 0/1 Diabetes diagnosis to Negative/Positive
diabetes_df <- diabetes_df %>%
mutate(Outcome = ifelse(Outcome == '1', "Positive", "Negative"))
head(diabetes_df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 Positive
## 2 0.351 31 Negative
## 3 0.672 32 Positive
## 4 0.167 21 Negative
## 5 2.288 33 Positive
## 6 0.201 30 Negative
Address missing values (Except for Pregnancy)
diabetes_df <- diabetes_df %>%
#mutate(across(.cols = Glucose:Age, .fns = ~ifelse(.x == 0, round(mean(.x), digits = 0), .x)))
mutate(across(.cols = Glucose:Age, ~ replace(., . == 0, round(mean(., na.rm = TRUE),0))))
# Pregnancies is left untouched, since a value of 0 there can be legitimate rather than missing.
head(diabetes_df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 80 33.6
## 2 1 85 66 29 80 26.6
## 3 8 183 64 21 80 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 21 80 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 Positive
## 2 0.351 31 Negative
## 3 0.672 32 Positive
## 4 0.167 21 Negative
## 5 2.288 33 Positive
## 6 0.201 30 Negative
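As a quick sanity check (a minimal sketch), we can confirm that the zero placeholders are gone from the imputed columns; every count should now be 0:
# Remaining zeros per imputed column
colSums(diabetes_df[, c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")] == 0)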
Basic histograms vs Diabetes Diagnosis
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:formattable':
##
## style
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
g_preg <- ggplot(diabetes_df, aes(x=Pregnancies, fill = Outcome)) +
geom_histogram(binwidth = 0.5) +
theme(legend.position = "none")
g_glucose <- ggplot(diabetes_df, aes(x=Glucose, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_bp <-ggplot(diabetes_df, aes(x=BloodPressure, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_skin <-ggplot(diabetes_df, aes(x=SkinThickness, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_insulin <-ggplot(diabetes_df, aes(x=Insulin, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_bmi <-ggplot(diabetes_df, aes(x=BMI, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_dpf <-ggplot(diabetes_df, aes(x=DiabetesPedigreeFunction, fill = Outcome)) +
geom_histogram(binwidth = 0.05) +
theme(legend.position = "none")
g_age <-ggplot(diabetes_df, aes(x=Age, fill = Outcome)) +
geom_histogram() +
theme(legend.position = "none")
g_diabetes <- ggplot(diabetes_df, aes(x = Outcome, fill = Outcome)) +
geom_bar() # Outcome is categorical, so geom_bar() is the appropriate geom (avoids the stat = "count" warning)
#-------------------------------------------------------------------------------
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:formattable':
##
## area
# https://patchwork.data-imaginist.com/
(g_preg + g_glucose + g_bp ) /
(g_skin + g_insulin + g_bmi) /
(g_dpf + g_age + g_diabetes)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The attempt below at correlating the predictors with the diagnosis was
unsuccessful, since the base cor() function needs a numeric outcome rather than
the Positive/Negative labels; a working sketch follows the commented-out attempt.
#categorical_var <- factor(c("Positive", "Negative"))
# Convert the categorical variable to a binary variable
#binary_var <- as.numeric(categorical_var) - 1
# Calculate point-biserial correlation
#correlation <- cor(binary_var, diabetes_df$Pregnancies)
#-------------------------------------------------------------------------------
# ANOVA attempt
#anova_result <- aov(diabetes_df$Pregnancies ~ categorical_var)
#-------------------------------------------------------------------------------
# Base correlations won't work here.
#cor(diabetes_df$Outcome, diabetes_df$Pregnancies)
#cor(diabetes_df$Outcome, diabetes_df$Glucose)
#cor(diabetes_df$Outcome, diabetes_df$BloodPressure)
#cor(diabetes_df$Outcome, diabetes_df$SkinThickness)
#cor(diabetes_df$Outcome, diabetes_df$Insulin)
#cor(diabetes_df$Outcome, diabetes_df$BMI)
#cor(diabetes_df$Outcome, diabetes_df$DiabetesPedigreeFunction)
#cor(diabetes_df$Outcome, diabetes_df$Age)
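Here is a working version of the same idea, as a minimal sketch (outcome_bin is a hypothetical helper name): recoding the diagnosis back to 0/1 turns the point-biserial correlation into an ordinary Pearson cor(), and aov() handles the one-way ANOVA.
# Recode the diagnosis to 0/1 so it can be correlated with the numeric predictors
outcome_bin <- as.numeric(diabetes_df$Outcome == "Positive")
# Point-biserial correlation of each predictor with the diagnosis
sapply(diabetes_df[, c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
                       "Insulin", "BMI", "DiabetesPedigreeFunction", "Age")],
       function(x) cor(outcome_bin, x))
# One-way ANOVA: does mean Glucose differ between the two diagnoses?
summary(aov(Glucose ~ Outcome, data = diabetes_df))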
Those who have diabetes (268/768)
diabetes_positive <- diabetes_df %>%
filter(Outcome == "Positive")
diabetes_positive %>%
group_by(Glucose, Age, BloodPressure) %>%
arrange(desc(Glucose))
## # A tibble: 268 × 9
## # Groups: Glucose, Age, BloodPressure [268]
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 199 76 43 80 42.9
## 2 0 198 66 32 274 41.3
## 3 2 197 70 45 543 30.5
## 4 8 197 74 21 80 25.9
## 5 2 197 70 99 80 34.7
## 6 7 196 90 21 80 39.8
## 7 8 196 76 29 280 37.5
## 8 1 196 76 36 249 36.5
## 9 7 195 70 33 145 25.1
## 10 6 195 70 21 80 30.9
## # ℹ 258 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <chr>