#——————————————————————————————————-
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(tidyr)
library(dplyr)
library(ggplot2)
airbnb_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/AB_NYC_2019.csv"
airbnb_raw <- read.csv(airbnb_url, header = TRUE, stringsAsFactors = FALSE)
head(airbnb_raw)
## id name host_id host_name
## 1 2539 Clean & quiet apt home by the park 2787 John
## 2 2595 Skylit Midtown Castle 2845 Jennifer
## 3 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth
## 4 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura
## 6 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris
## neighbourhood_group neighbourhood latitude longitude room_type price
## 1 Brooklyn Kensington 40.64749 -73.97237 Private room 149
## 2 Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225
## 3 Manhattan Harlem 40.80902 -73.94190 Private room 150
## 4 Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89
## 5 Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80
## 6 Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200
## minimum_nights number_of_reviews last_review reviews_per_month
## 1 1 9 2018-10-19 0.21
## 2 1 45 2019-05-21 0.38
## 3 3 0 NA
## 4 1 270 2019-07-05 4.64
## 5 10 9 2018-11-19 0.10
## 6 3 74 2019-06-22 0.59
## calculated_host_listings_count availability_365
## 1 6 365
## 2 2 355
## 3 1 365
## 4 1 194
## 5 1 0
## 6 1 129
airbnb_df <- as.data.frame(airbnb_raw)
airbnb_df <- subset(airbnb_df, price > 0) # Remove those with price missing
head(airbnb_df)
## id name host_id host_name
## 1 2539 Clean & quiet apt home by the park 2787 John
## 2 2595 Skylit Midtown Castle 2845 Jennifer
## 3 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth
## 4 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura
## 6 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris
## neighbourhood_group neighbourhood latitude longitude room_type price
## 1 Brooklyn Kensington 40.64749 -73.97237 Private room 149
## 2 Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225
## 3 Manhattan Harlem 40.80902 -73.94190 Private room 150
## 4 Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89
## 5 Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80
## 6 Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200
## minimum_nights number_of_reviews last_review reviews_per_month
## 1 1 9 2018-10-19 0.21
## 2 1 45 2019-05-21 0.38
## 3 3 0 NA
## 4 1 270 2019-07-05 4.64
## 5 10 9 2018-11-19 0.10
## 6 3 74 2019-06-22 0.59
## calculated_host_listings_count availability_365
## 1 6 365
## 2 2 355
## 3 1 365
## 4 1 194
## 5 1 0
## 6 1 129
ggplot(airbnb_df, aes(x = price)) +
geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) + # A solid scale like above seems to miss some data
geom_vline(xintercept = median(airbnb_df$price), color = "red", linewidth = 1, linetype = "solid") +
geom_vline(xintercept = mean(airbnb_df$price), color = "darkgreen", linewidth = 1, linetype = "solid") +
annotate("text", x = 500, y = 2500, label = "Median Price = $106", color = "red", size = 5) +
annotate("text", x = 500, y = 3500, label = "Mean Price = $163", color = "darkgreen", size = 5) +
labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices")
## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
# Get Mean and Median prices for each neighborhood
price_neighborhood_group <- airbnb_df %>%
group_by(neighbourhood_group) %>%
dplyr::summarise(median_price = round(median(price),0),
mean_price = round(mean(price),0))
# Plot- a more detailed breakdown of the prior graph.
ggplot(airbnb_df, aes(x = price)) +
geom_histogram(bins = 50, fill = "blue", color = "white", alpha = 0.8) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 100)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
labs(x = "Price", y = "Frequency", title = "NYC AirBnB Prices per Neighborhood") +
facet_wrap(~neighbourhood_group) +
geom_text(data = price_neighborhood_group, y = 3000, aes(x = 500, label = paste("Mean Price = $", mean_price)), color = "darkgreen", size = 3) +
geom_text(data = price_neighborhood_group, y = 2500, aes(x = 500, label = paste("Median Price = $", median_price)), color = "red", size = 3)
## Warning: Removed 239 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 10 rows containing missing values (`geom_bar()`).
ggplot(airbnb_df, aes(x = number_of_reviews, y = price, color = room_type)) +
#geom_point(alpha = 0.5, color = "blue") +
geom_point() +
facet_wrap(~neighbourhood_group, scales = "free") +
scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
#scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews")
airbnb_df_below1000 <- airbnb_df %>%
filter(price < 1000)
ggplot(airbnb_df_below1000, aes(x = number_of_reviews, y = price, color = room_type)) +
#geom_point(alpha = 0.5, color = "blue") +
geom_point() +
facet_wrap(~neighbourhood_group, scales = "free") +
scale_x_continuous(limits = c(0, 650), breaks = seq(0, 650, by = 100)) +
#scale_y_continuous(limits = c(0, 10000), breaks = seq(0, 10000, by = 2500)) +
labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews for AirBnB below 1000$")
#-------------------------------------------------------------------------------
# Looking at the outliers for price vs reviews
airbnb_manyreviews_highprice <- airbnb_df %>%
filter(number_of_reviews > 400 & price > 500)
ggplot(airbnb_manyreviews_highprice, aes(x = number_of_reviews, y = price)) +
#geom_point(alpha = 0.5, color = "blue") +
geom_point() +
geom_text(label = airbnb_manyreviews_highprice$name) +
facet_wrap(~neighbourhood_group, scales = "free") +
scale_x_continuous(limits = c(446, 448), breaks = seq(446,448, by = 1)) +
scale_y_continuous(limits = c(574, 576), breaks = seq(574,576, by = 1)) +
labs(x = "Number of Reviews", y = "Price", title = "Price vs Reviews for AirBnB over 1000$")
airbnb_neighborhood_homes <-airbnb_df %>%
group_by(neighbourhood_group) %>%
count(room_type)
airbnb_neighborhood_homes
## # A tibble: 15 × 3
## # Groups: neighbourhood_group [5]
## neighbourhood_group room_type n
## <chr> <chr> <int>
## 1 Bronx Entire home/apt 379
## 2 Bronx Private room 651
## 3 Bronx Shared room 60
## 4 Brooklyn Entire home/apt 9558
## 5 Brooklyn Private room 10126
## 6 Brooklyn Shared room 411
## 7 Manhattan Entire home/apt 13198
## 8 Manhattan Private room 7982
## 9 Manhattan Shared room 480
## 10 Queens Entire home/apt 2096
## 11 Queens Private room 3372
## 12 Queens Shared room 198
## 13 Staten Island Entire home/apt 176
## 14 Staten Island Private room 188
## 15 Staten Island Shared room 9
# Types of Airbnb's in each neighborhood
ggplot(airbnb_neighborhood_homes, aes(x = neighbourhood_group, y = n, fill = room_type)) +
geom_bar(position = "dodge", stat = "identity") +
labs(title = "Types of Airbnb's", x = "Neighbourhood", y = "Count") +
coord_flip()
ggplot(data = airbnb_df) +
geom_bar(mapping = aes(x=room_type, fill=neighbourhood_group), position = "dodge") +
labs(title = "AirBnB Listings per Neighborhood ", x = "Listings", y = "Count") +
coord_flip()
# By neighborhood
airbnb_neighborhood_prices <-airbnb_df %>%
group_by(neighbourhood_group, neighbourhood) %>%
summarize(avg_price = mean(price),
min_price = min(price),
max_price = max(price))
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
ggplot(data = airbnb_neighborhood_prices, aes(x = neighbourhood, y = avg_price, color = neighbourhood)) +
geom_point() +
labs(title = " Average Prices per neighborhood") +
xlab("Neighborhood") + ylab("Average Price") +
theme(axis.text.x=element_blank()) + # axis.text.x=element_blank() removes all neighborhood names
theme(legend.position = "none") +
facet_wrap(~neighbourhood_group)
outliers <- airbnb_df %>%
filter(price >= 2500)
outliers %>%
group_by(neighbourhood_group, neighbourhood) %>%
arrange(price)
## # A tibble: 77 × 16
## # Groups: neighbourhood_group, neighbourhood [36]
## id name host_id host_name neighbourhood_group neighbourhood latitude
## <int> <chr> <int> <chr> <chr> <chr> <dbl>
## 1 893413 "Archi… 4.75e6 Martin Manhattan East Village 40.7
## 2 2276383 "Penth… 1.16e7 Mike Manhattan Greenwich Vi… 40.7
## 3 12339863 "Loft" 1.00e7 Claudine Manhattan Tribeca 40.7
## 4 14408114 "Unpar… 8.36e5 Henry Manhattan Midtown 40.8
## 5 19554980 "A Coz… 9.78e7 Logan Manhattan Upper West S… 40.8
## 6 19698169 "\"The… 1.32e8 Kathy Bronx Riverdale 40.9
## 7 23373090 "SHOOT… 1.18e7 V Brooklyn Williamsburg 40.7
## 8 25018204 "Parad… 1.73e8 Rasmus Manhattan Harlem 40.8
## 9 31470004 "Priva… 7.12e7 Max Manhattan East Village 40.7
## 10 34592851 "Beaut… 1.99e7 Cheryl Brooklyn Crown Heights 40.7
## # ℹ 67 more rows
## # ℹ 9 more variables: longitude <dbl>, room_type <chr>, price <int>,
## # minimum_nights <int>, number_of_reviews <int>, last_review <chr>,
## # reviews_per_month <dbl>, calculated_host_listings_count <int>,
## # availability_365 <int>
ggplot(data = outliers, aes(x = price, fill = neighbourhood_group)) +
geom_histogram(bins= 100) +
labs(title = "Outlier Prices per neighborhood (2500$ +)") +
xlab("Price") + ylab("Frequency") +
scale_x_continuous(limits = c(2000, 10500), breaks = seq(2000, 10000, by = 1000)) +
scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12, by = 2))
## Warning: Removed 10 rows containing missing values (`geom_bar()`).
airbnb_Brooklyn <- airbnb_df%>%
filter(neighbourhood_group == "Brooklyn")
ggplot(airbnb_Brooklyn, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
geom_bar(position = "dodge", stat = "identity") +
labs(title = "Listings in Brooklyn", x = "Listings", y = "Neighbourhood")
#-------------------------------------------------------------------------------
airbnb_Manhattan <- airbnb_df%>%
filter(neighbourhood_group == "Manhattan")
ggplot(airbnb_Manhattan, aes(x = calculated_host_listings_count, y = neighbourhood, fill = room_type)) +
geom_bar(position = "dodge", stat = "identity") +
labs(title = "Listings in Manhattan", x = "Listings", y = "Neighbourhood")
spotify_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/spotify-2023.csv"
spotify_raw <- read.csv(spotify_url, header = TRUE, stringsAsFactors = FALSE)
# Make into dataframe
spotify_df <- as.data.frame(spotify_raw)
head(spotify_df)
## track_name artist.s._name artist_count
## 1 Seven (feat. Latto) (Explicit Ver.) Latto, Jung Kook 2
## 2 LALA Myke Towers 1
## 3 vampire Olivia Rodrigo 1
## 4 Cruel Summer Taylor Swift 1
## 5 WHERE SHE GOES Bad Bunny 1
## 6 Sprinter Dave, Central Cee 2
## released_year released_month released_day in_spotify_playlists
## 1 2023 7 14 553
## 2 2023 3 23 1474
## 3 2023 6 30 1397
## 4 2019 8 23 7858
## 5 2023 5 18 3133
## 6 2023 6 1 2186
## in_spotify_charts streams in_apple_playlists in_apple_charts
## 1 147 141381703 43 263
## 2 48 133716286 48 126
## 3 113 140003974 94 207
## 4 100 800840817 116 207
## 5 50 303236322 84 133
## 6 91 183706234 67 213
## in_deezer_playlists in_deezer_charts in_shazam_charts bpm key mode
## 1 45 10 826 125 B Major
## 2 58 14 382 92 C# Major
## 3 91 14 949 138 F Major
## 4 125 12 548 170 A Major
## 5 87 15 425 144 A Minor
## 6 88 17 946 141 C# Major
## danceability_. valence_. energy_. acousticness_. instrumentalness_.
## 1 80 89 83 31 0
## 2 71 61 74 7 0
## 3 51 32 53 17 0
## 4 55 58 72 11 0
## 5 65 23 80 14 63
## 6 92 66 58 19 0
## liveness_. speechiness_.
## 1 8 4
## 2 10 4
## 3 31 6
## 4 11 15
## 5 11 6
## 6 8 24
# The data has lots of odd � errors (replacement character; https://charbase.com/fffd-unicode-replacement-character), mostly for Hispanic accents.
library(stringi)
# Convert all non-ASCII characters to spaces in the song name
spotify_df[,1] <- stringi::stri_trans_general(spotify_df[,1], "Latin-ASCII")
spotify_df[,1] <- iconv(spotify_df[,1], to = "ASCII", sub = "")
# Convert all non-ASCII characters to spaces in the artist's name
spotify_df[,2] <- stringi::stri_trans_general(spotify_df[,2], "Latin-ASCII")
spotify_df[,2] <- iconv(spotify_df[,2], to = "ASCII", sub = "")
most_popular_songs_spotifylists<- spotify_df %>%
group_by(released_year, released_month, track_name) %>%
summarise(Spotify_Playlist_Count = sum(in_spotify_playlists)) %>%
arrange(desc(Spotify_Playlist_Count))
## `summarise()` has grouped output by 'released_year', 'released_month'. You can
## override using the `.groups` argument.
head(most_popular_songs_spotifylists)
## # A tibble: 6 × 4
## # Groups: released_year, released_month [5]
## released_year released_month track_name Spotify_Playlist_Count
## <int> <int> <chr> <int>
## 1 2013 1 Get Lucky - Radio Edit 52898
## 2 2003 9 Mr. Brightside 51979
## 3 2013 1 Wake Me Up - Radio Edit 50887
## 4 1991 9 Smells Like Teen Spirit -… 49991
## 5 1984 10 Take On Me 44927
## 6 2019 11 Blinding Lights 43899
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(formattable)
##
## Attaching package: 'formattable'
## The following objects are masked from 'package:scales':
##
## comma, percent, scientific
# Streams column not numeric, change that
spotify_df$streams <- as.numeric(spotify_df$streams)
## Warning: NAs introduced by coercion
# For the one with no value, just use the average amount
spotify_df[575,9] <- colMeans(spotify_df[ ,9, drop =FALSE], na.rm = TRUE)
# Now that streams is a numeric field, we can get the total plays for each song
most_popular_songs_streams<- spotify_df %>%
group_by(released_year, released_month, track_name) %>%
summarise(Stream_Plays = sum(streams)) %>%
arrange(desc(Stream_Plays))
## `summarise()` has grouped output by 'released_year', 'released_month'. You can
## override using the `.groups` argument.
# Just used to make the number legible
my_comma <- scales::label_comma(accuracy = .1, big.mark = ",", decimal.mark = ".")
most_popular_songs_streams$Stream_Plays <- my_comma(most_popular_songs_streams$Stream_Plays)
# Output
head(most_popular_songs_streams)
## # A tibble: 6 × 4
## # Groups: released_year, released_month [6]
## released_year released_month track_name Stream_Plays
## <int> <int> <chr> <chr>
## 1 2019 11 Blinding Lights 3,703,895,0…
## 2 2017 1 Shape of You 3,562,543,8…
## 3 2018 11 Someone You Loved 2,887,241,8…
## 4 2019 5 Dance Monkey 2,864,791,6…
## 5 2018 10 Sunflower - Spider-Man: Into the Sp… 2,808,096,5…
## 6 2016 4 One Dance 2,713,922,3…
diabetes_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/diabetes.csv"
diabetes_raw <- read.csv(diabetes_url, header = TRUE, stringsAsFactors = FALSE)
# Make into dataframe
diabetes_df <- as.data.frame(diabetes_raw)
head(diabetes_df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
diabetes_df <- diabetes_df %>%
mutate(Outcome = ifelse(Outcome == '1', "Positive", "Negative"))
head(diabetes_df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 Positive
## 2 0.351 31 Negative
## 3 0.672 32 Positive
## 4 0.167 21 Negative
## 5 2.288 33 Positive
## 6 0.201 30 Negative
diabetes_df <- diabetes_df %>%
#mutate(across(.cols = Glucose:Age, .fns = ~ifelse(.x == 0, round(mean(.x), digits = 0), .x)))
mutate(across(.cols = Glucose:Age, ~ replace(., . == 0, round(mean(., na.rm = TRUE),0))))
# Didn't mutate to zero-values as it could be men.
head(diabetes_df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 80 33.6
## 2 1 85 66 29 80 26.6
## 3 8 183 64 21 80 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 21 80 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 Positive
## 2 0.351 31 Negative
## 3 0.672 32 Positive
## 4 0.167 21 Negative
## 5 2.288 33 Positive
## 6 0.201 30 Negative
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:formattable':
##
## style
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
g_preg <- ggplot(diabetes_df, aes(x=Pregnancies, fill = Outcome)) +
geom_histogram(binwidth = 0.5) +
theme(legend.position = "none")
g_glucose <- ggplot(diabetes_df, aes(x=Glucose, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_bp <-ggplot(diabetes_df, aes(x=BloodPressure, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_skin <-ggplot(diabetes_df, aes(x=SkinThickness, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_insulin <-ggplot(diabetes_df, aes(x=Insulin, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_bmi <-ggplot(diabetes_df, aes(x=BMI, fill = Outcome)) +
geom_histogram(binwidth = 1) +
theme(legend.position = "none")
g_dpf <-ggplot(diabetes_df, aes(x=DiabetesPedigreeFunction, fill = Outcome)) +
geom_histogram(binwidth = 0.05) +
theme(legend.position = "none")
g_age <-ggplot(diabetes_df, aes(x=Age, fill = Outcome)) +
geom_histogram() +
theme(legend.position = "none")
g_diabetes <-ggplot(diabetes_df, aes(x=Outcome, fill = Outcome)) +
geom_histogram(stat="count")
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
#-------------------------------------------------------------------------------
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:formattable':
##
## area
# https://patchwork.data-imaginist.com/
(g_preg + g_glucose + g_bp ) /
(g_skin + g_insulin + g_bmi) /
(g_dpf + g_age + g_diabetes)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#categorical_var <- factor(c("Positive", "Negative"))
# Convert the categorical variable to a binary variable
#binary_var <- as.numeric(categorical_var) - 1
# Calculate point-biserial correlation
#correlation <- cor(binary_var, diabetes_df$Pregnancies)
#-------------------------------------------------------------------------------
# ANOVA attempt
#anova_result <- aov(diabetes_df$Pregnancies ~ categorical_var)
#-------------------------------------------------------------------------------
# Base correlations won't work here.
#cor(diabetes_df$Outcome, diabetes_df$Pregnancies)
#cor(diabetes_df$Outcome, diabetes_df$Glucose)
#cor(diabetes_df$Outcome, diabetes_df$BloodPressure)
#cor(diabetes_df$Outcome, diabetes_df$SkinThickness)
#cor(diabetes_df$Outcome, diabetes_df$Insulin)
#cor(diabetes_df$Outcome, diabetes_df$BMI)
#cor(diabetes_df$Outcome, diabetes_df$DiabetesPedigreeFunction)
#cor(diabetes_df$Outcome, diabetes_df$Age)
diabetes_positive <- diabetes_df %>%
filter(Outcome == "Positive")
diabetes_positive %>%
group_by(Glucose, Age, BloodPressure) %>%
arrange(desc(Glucose))
## # A tibble: 268 × 9
## # Groups: Glucose, Age, BloodPressure [268]
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 199 76 43 80 42.9
## 2 0 198 66 32 274 41.3
## 3 2 197 70 45 543 30.5
## 4 8 197 74 21 80 25.9
## 5 2 197 70 99 80 34.7
## 6 7 196 90 21 80 39.8
## 7 8 196 76 29 280 37.5
## 8 1 196 76 36 249 36.5
## 9 7 195 70 33 145 25.1
## 10 6 195 70 21 80 30.9
## # ℹ 258 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <chr>
#——————————————————————————————————- # Folorunsho Atanda
library(tidyverse)
library(RMySQL)
## Loading required package: DBI
library(dotenv)
dotenv::load_dot_env(file = "sql_pw.env")
my_sql_pw <- Sys.getenv("MYSQL_PW")
db <- dbConnect(
MySQL(),
user = "root",
#password = my_sql_pw,
password = "0830spscuny2023!!!",
dbname = "project_2",
host = "localhost",
port = 3306
)
diabetes <- db %>%
dbGetQuery("select * from project_2.diabetes")
diabetes_df <- as_tibble(diabetes)
# Disconnect database
db_status <- dbDisconnect(db)
Because 0 can’t exist in the columns Glucose, BloodPressure, SkinThickness, Insulin, BMI, DPF and Age, going to change does to NA. Then also change 0 to Negative and 1 to Positive in Outcome
columns_to_modify <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin",
"BMI", "DPF", "Age")
diabetes_df <- diabetes_df %>%
mutate(across(all_of(columns_to_modify), ~if_else(. == 0, NA, .))) %>%
mutate("Outcome" = ifelse(`Outcome` == 0, "Negative", "Positive"))
glimpse(diabetes_df)
## Rows: 768
## Columns: 10
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ Pregnancies <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7,…
## $ Glucose <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168,…
## $ BloodPressure <int> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, …
## $ SkinThickness <int> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, …
## $ Insulin <int> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 84…
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA…
## $ DPF <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, …
## $ Age <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, …
## $ Outcome <chr> "Positive", "Negative", "Positive", "Negative", "Positiv…
summary(diabetes_df)
## ID Pregnancies Glucose BloodPressure
## Min. : 1.0 Min. : 0.000 Min. : 44.0 Min. : 24.00
## 1st Qu.:192.8 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00
## Median :384.5 Median : 3.000 Median :117.0 Median : 72.00
## Mean :384.5 Mean : 3.845 Mean :121.7 Mean : 72.41
## 3rd Qu.:576.2 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00
## Max. :768.0 Max. :17.000 Max. :199.0 Max. :122.00
## NA's :5 NA's :35
## SkinThickness Insulin BMI DPF
## Min. : 7.00 Min. : 14.00 Min. :18.20 Min. :0.0780
## 1st Qu.:22.00 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437
## Median :29.00 Median :125.00 Median :32.30 Median :0.3725
## Mean :29.15 Mean :155.55 Mean :32.46 Mean :0.4719
## 3rd Qu.:36.00 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262
## Max. :99.00 Max. :846.00 Max. :67.10 Max. :2.4200
## NA's :227 NA's :374 NA's :11
## Age Outcome
## Min. :21.00 Length:768
## 1st Qu.:24.00 Class :character
## Median :29.00 Mode :character
## Mean :33.24
## 3rd Qu.:41.00
## Max. :81.00
##
diabetes_df %>%
ggplot(aes(x = `Pregnancies`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `Pregnancies`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
) +
labs(title = "Pregnancies vs Outcome")
diabetes_df %>%
select(c(`Pregnancies`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
Pregnancies_mean = mean(`Pregnancies`),
Pregnancies_median = median(`Pregnancies`),
Pregnancies_Q1 = quantile(`Pregnancies`, probs = 0.25),
Pregnancies_Q2 = quantile(`Pregnancies`, probs = 0.75)
)
## # A tibble: 2 × 5
## Outcome Pregnancies_mean Pregnancies_median Pregnancies_Q1 Pregnancies_Q2
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 3.30 2 1 5
## 2 Positive 4.87 4 1.75 8
Analysis: The distribution of
Pregnancies to Outcome is right skewed. This
tells us that the median and mean are not the same. Which is confirmed
in the box plot and the stat table.
diabetes_df %>%
ggplot(aes(x = `Glucose`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE
) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `Glucose`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
na.rm = TRUE
) +
labs(title = "Glucose vs Outcome")
diabetes_df %>%
select(c(`Glucose`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
Glucose_mean = mean(`Glucose`, na.rm = TRUE),
Glucose_median = median(`Glucose`, na.rm = TRUE),
Glucose_Q1 = quantile(`Glucose`, probs = 0.25, na.rm = TRUE),
Glucose_Q2 = quantile(`Glucose`, probs = 0.75, na.rm = TRUE)
)
## # A tibble: 2 × 5
## Outcome Glucose_mean Glucose_median Glucose_Q1 Glucose_Q2
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 111. 107 93 125
## 2 Positive 142. 140 119 167
Analysis: The distribution of Glucose
to Outcome is symmetrical. This tells us that the median
and mean are very close to one another. Which is confirmed in the box
plot and the stat table. From the stat table we can infer that most of
the data is between 93 and 167.
diabetes_df %>%
ggplot(aes(x = `BloodPressure`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE
) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `BloodPressure`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
na.rm = TRUE
)
diabetes_df %>%
select(c(`BloodPressure`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
BloodPressure_mean = mean(`BloodPressure`, na.rm = TRUE),
BloodPressure_median = median(`BloodPressure`, na.rm = TRUE),
BloodPressure_Q1 = quantile(`BloodPressure`, probs = 0.25, na.rm = TRUE),
BloodPressure_Q2 = quantile(`BloodPressure`, probs = 0.75, na.rm = TRUE)
)
## # A tibble: 2 × 5
## Outcome BloodPressure_mean BloodPressure_median BloodPressure_Q1
## <chr> <dbl> <dbl> <dbl>
## 1 Negative 70.9 70 62
## 2 Positive 75.3 74.5 68
## # ℹ 1 more variable: BloodPressure_Q2 <dbl>
Analysis: The distribution of
BloodPressure to Outcome is symmetrical. This
tells us that the median and mean are very close to one another. Which
is confirmed in the box plot and the stat table. From the stat table we
can infer that most of the data is between 62 and 84.
diabetes_df %>%
ggplot(aes(x = `SkinThickness`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE
) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `SkinThickness`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
na.rm = TRUE
)
diabetes_df %>%
select(c(`SkinThickness`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
SkinThickness_mean = mean(`SkinThickness`, na.rm = TRUE),
SkinThickness_median = median(`SkinThickness`, na.rm = TRUE),
SkinThickness_Q1 = quantile(`SkinThickness`, probs = 0.25, na.rm = TRUE),
SkinThickness_Q2 = quantile(`SkinThickness`, probs = 0.75, na.rm = TRUE)
)
## # A tibble: 2 × 5
## Outcome SkinThickness_mean SkinThickness_median SkinThickness_Q1
## <chr> <dbl> <dbl> <dbl>
## 1 Negative 27.2 27 19
## 2 Positive 33 32 27
## # ℹ 1 more variable: SkinThickness_Q2 <dbl>
Analysis: The distribution of
SkinThickness to Outcome is symmetrical. This
tells us that the median and mean are very close to one another. Which
is confirmed in the box plot and the stat table. From the stat table we
can infer that most of the data is between 19 and 39.
diabetes_df %>%
ggplot(aes(x = `Insulin`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE
) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `Insulin`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
na.rm = TRUE
)
diabetes_df %>%
select(c(`Insulin`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
Insulin_mean = mean(`Insulin`, na.rm = TRUE),
Insulin_median = median(`Insulin`, na.rm = TRUE),
Insulin_Q1 = quantile(`Insulin`, probs = 0.25, na.rm = TRUE),
Insulin_Q2 = quantile(`Insulin`, probs = 0.75, na.rm = TRUE)
)
## # A tibble: 2 × 5
## Outcome Insulin_mean Insulin_median Insulin_Q1 Insulin_Q2
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 130. 102. 66 161.
## 2 Positive 207. 170. 128. 239.
Analysis: The distribution of Insulin
to Outcome is right skewed. This tells us that the median
and mean are not the same. Which is confirmed in the box plot and the
stat table.
diabetes_df %>%
ggplot(aes(x = `BMI`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE
) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `BMI`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
na.rm = TRUE
)
diabetes_df %>%
select(c(`BMI`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
BMI_mean = mean(`BMI`, na.rm = TRUE),
BMI_median = median(`BMI`, na.rm = TRUE),
BMI_Q1 = quantile(`BMI`, probs = 0.25, na.rm = TRUE),
BMI_Q2 = quantile(`BMI`, probs = 0.75, na.rm = TRUE)
)
## # A tibble: 2 × 5
## Outcome BMI_mean BMI_median BMI_Q1 BMI_Q2
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 30.9 30.1 25.6 35.3
## 2 Positive 35.4 34.3 30.9 38.9
Analysis: The distribution of BMI to
Outcome is symmetrical. This tells us that the median and
mean are very close to one another. Which is confirmed in the box plot
and the stat table. From the stat table we can infer that most of the
data is between 19 and 39.
diabetes_df %>%
ggplot(aes(x = `DPF`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE
) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `DPF`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
na.rm = TRUE
)
diabetes_df %>%
select(c(`DPF`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
DPF_mean = mean(`DPF`, na.rm = TRUE),
DPF_median = median(`DPF`, na.rm = TRUE),
DPF_Q1 = quantile(`DPF`, probs = 0.25, na.rm = TRUE),
DPF_Q2 = quantile(`DPF`, probs = 0.75, na.rm = TRUE)
)
## # A tibble: 2 × 5
## Outcome DPF_mean DPF_median DPF_Q1 DPF_Q2
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 0.430 0.336 0.230 0.562
## 2 Positive 0.550 0.449 0.262 0.728
Analysis: The distribution of DPF to
Outcome is right skewed. This tells us that the median and
mean are not the same. Which is confirmed in the box plot and the stat
table.
diabetes_df %>%
ggplot(aes(x = `Age`, fill = `Outcome`)) +
geom_bar(
stat = "count",
position = "dodge",
na.rm = TRUE
) +
facet_wrap(~`Outcome`)
diabetes_df %>%
ggplot(aes(x = `Outcome`, y = `Age`)) +
geom_boxplot(
stat = "boxplot",
color = c("red","blue"),
na.rm = TRUE
)
diabetes_df %>%
select(c(`Age`, `Outcome`)) %>%
group_by(`Outcome`) %>%
summarise(
Age_mean = mean(`Age`),
Age_median = median(`Age`),
Age_Q1 = quantile(`Age`, probs = 0.25),
Age_Q2 = quantile(`Age`, probs = 0.75)
)
## # A tibble: 2 × 5
## Outcome Age_mean Age_median Age_Q1 Age_Q2
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Negative 31.2 27 23 37
## 2 Positive 37.1 36 28 44
Analysis: The distribution of Age to
Outcome is right skewed. This tells us that the median and
mean are not the same. Which is confirmed in the box plot and the stat
table.
Because Glucose, BloodPressure, and
BMI have normal distribution, I want to see if there is a
correlation between them, focusing on a positive
diabetes Outcome.
diabetes_cor <- diabetes_df %>%
select(c(`Glucose`, `BloodPressure`, `BMI`, `Outcome`)) %>%
filter(`Outcome` == "Positive")
Glucose_BloodPressure_cor <- cor(
x = as.numeric(diabetes_cor$Glucose),
y = as.numeric(diabetes_cor$BloodPressure),
use = "na.or.complete",
method = "pearson"
)
Glucose_BMI_cor <- cor(
x = as.numeric(diabetes_cor$Glucose),
y = as.numeric(diabetes_cor$BMI),
use = "na.or.complete",
method = "pearson"
)
BloodPressure_BMI_cor <- cor(
x = as.numeric(diabetes_cor$BloodPressure),
y = as.numeric(diabetes_cor$BMI),
use = "na.or.complete",
method = "pearson"
)
correlation_df <- tibble(
Relationship = c("Glucose vs Blood Pressure", "Glucose vs BMI",
"Blood Pressure vs BMI"),
Correlation = c(Glucose_BloodPressure_cor, Glucose_BMI_cor,
BloodPressure_BMI_cor)
)
correlation_df
## # A tibble: 3 × 2
## Relationship Correlation
## <chr> <dbl>
## 1 Glucose vs Blood Pressure 0.101
## 2 Glucose vs BMI 0.0566
## 3 Blood Pressure vs BMI 0.249
Analysis: From the table we see that there is no correlation between these three variables.
#——————————————————————————————————-
I have decided to work with the New York Prison Employee Discipline Data from The Marshall Project Found here: (https://observablehq.com/@themarshallproject/new-york-prison-employee-discipline-data)
The data is originally retrieved from the NY State Department of Corrections and Community Supervision
employee_discipline_url <- "https://raw.githubusercontent.com/xhuliaturkaj/Project_2/main/NYPrisonEmployeeDiscipline.csv"
employee_discipline <- read.csv(employee_discipline_url, header = TRUE)
head(employee_discipline)
## LNAME FNAME TITLE FACILITY UNION MISCONDUCT CLOSED
## 1 STEVENSON SYLBERN CO EDGECOMBE NYSCOPBA FD, NG 01-Jul-20
## 2 BODDEN MICHAEL CO GREEN HAVEN NYSCOPBA IA, FD 01-Jul-20
## 3 GIVANS-DANIELS GLENDA CO BEDFORD HILLS NYSCOPBA DI 06-Jul-20
## 4 ROUANTREE KEITH CO GREEN HAVEN NYSCOPBA FD, IA, NG 06-Jul-20
## 5 MELENDEZ IRIS CO COXSACKIE NYSCOPBA IN 06-Jul-20
## 6 BUSH JONATHAN ASAT-PA ALBION PEF IN 08-Jul-20
## REASON PENALTY
## 1 NOD DISMISSAL & ACCRUALS
## 2 NOD DISMISSAL & ACCRUALS
## 3 NOD DISMISSAL & ACCRUALS
## 4 NOD DISMISSAL & ACCRUALS
## 5 NOD DISMISSAL & ACCRUALS
## 6 NOD 30D SP
## Field0
## 1 MADE FALSE/MISLEADING ENTRIES INTO LOGBOOK. DID NOT PERFORM SECURITY ROUNDS ON 5/16/20, 5/17/20 AND 5/19/20 . FAILED TO MAINTAIN SECURE CONTROL OF SECURITY EQUIPMENT (LOST KEY RING AND COULD NOT LOCATE IT)
## 2 ENGAGED IN UNJUSTIFIED AND EXCESSIVE USE OF FORCE ON AN INMATE, THREATENED INMATE, MADE FALSE AND/OR MISLEADING INFORMATION REGARDING THE USE OF FORCE INCIDENCES
## 3 ENGAGED IN RELATIONSHIP W/INMATES/PAROLEES HAVING OVER 500 TELEPHONE CALLS STATEING CONVERSATIONS NOT IN FURTHERANCE OF WORK RELATED OBJECTIVES & W/O PERMISSION TO DO SO FROM DOCCS.
## 4 FAILED TO DETECT AND/OR TAK ANY ACTION REGARDING AN INMATE ENGAGED IN IMPROPER ACTIVITIES (REMOVING A FEEDING TUBE FROM HIS NOSE), FAILED TO TAKE ACTION TO ADDRESS ANOTHER EMPLOYEE
## 5 REFUSED DIRECT ORDER FROM SGT. TO ASSUME DUTIES OF C-3 DIVISION WHILE YELLING REFUSAL TO SGT. REFUSED DIRECT ORDER FROM LT. TO ASSUME DUTIES WHILE YELLING REFUSAL TO LT. REFUSED DIRECT ORDER FROM SGT. TO ASSUME DUTIES OF RMHU ESCORTWHILE YELLING REFUSAL T
## 6 INSUBORDINATE REFUSED TO COMMUNICATE PROPERLY WHEN ASKED IF INFORMATION WAS CORRECT
## DISPODT DISPO PENALTYDIS ARBITRATOR PERCLOSED page_num
## 1 7/28/2020 S $2,000, 12M DEP Y 1
## 2 4/30/2021 R RESIGNING EFFECTINVE 4/30/21 Y 1
## 3 10/1/2020 S 3M SP, 12M DEP Y 1
## 4 7/7/2020 S 3W SP, 12M DEP Y 1
## 5 7/24/2020 S 3W SP Y 1
## 6 10/7/2020 S 1W SP, 5D, 18M DEP Y 1
unique_names <- employee_discipline %>%
group_by(LNAME, FNAME) %>%
summarize(count = n())
## `summarise()` has grouped output by 'LNAME'. You can override using the
## `.groups` argument.
n_unique_combinations <- nrow(unique_names)
print(n_unique_combinations)
## [1] 953
There is a total of 953 unique combinations which means there are 953 employees included in this data set. Let’s format the data frame in a tidy format.
#First I will create a new column that represents first and last names and will get rid
# of the separate first and last name columns.
employee_discipline <- employee_discipline %>%
mutate(FULL_NAME = paste(LNAME, FNAME)) %>%
select(FULL_NAME, everything(), -LNAME, -FNAME)
#Let's run distinct one more time on the composed FULL_NAME variable
length(unique(employee_discipline$FULL_NAME))
## [1] 953
#We again confirmed that there are 953 unique observations/employees in this data frame
#First remove the columns that we are not interested in
employee_discipline <- employee_discipline %>%
select(-c(UNION, CLOSED, REASON, Field0, DISPODT, DISPO, PENALTYDIS, ARBITRATOR, PERCLOSED, page_num ))
#Convert the other columns to factor
employee_discipline <- employee_discipline %>%
mutate_at(vars(FACILITY, TITLE, MISCONDUCT, PENALTY), as.factor)
#Next let's create a unique identifier for each Employee
employee_discipline <- employee_discipline %>%
group_by(FULL_NAME) %>%
mutate(employee_id = cur_group_id()) %>%
ungroup()
#Make sure there are 953 employee ids
length(unique(employee_discipline$employee_id))
## [1] 953
head(employee_discipline)
## # A tibble: 6 × 6
## FULL_NAME TITLE FACILITY MISCONDUCT PENALTY employee_id
## <chr> <fct> <fct> <fct> <fct> <int>
## 1 STEVENSON SYLBERN CO EDGECOMBE FD, NG DISMISSAL … 853
## 2 BODDEN MICHAEL CO GREEN HAVEN IA, FD DISMISSAL … 132
## 3 GIVANS-DANIELS GLENDA CO BEDFORD HILLS DI DISMISSAL … 395
## 4 ROUANTREE KEITH CO GREEN HAVEN FD, IA, NG DISMISSAL … 792
## 5 MELENDEZ IRIS CO COXSACKIE IN DISMISSAL … 629
## 6 BUSH JONATHAN ASAT-PA ALBION IN 30D SP 179
head(employee_discipline)
## # A tibble: 6 × 6
## FULL_NAME TITLE FACILITY MISCONDUCT PENALTY employee_id
## <chr> <fct> <fct> <fct> <fct> <int>
## 1 STEVENSON SYLBERN CO EDGECOMBE FD, NG DISMISSAL … 853
## 2 BODDEN MICHAEL CO GREEN HAVEN IA, FD DISMISSAL … 132
## 3 GIVANS-DANIELS GLENDA CO BEDFORD HILLS DI DISMISSAL … 395
## 4 ROUANTREE KEITH CO GREEN HAVEN FD, IA, NG DISMISSAL … 792
## 5 MELENDEZ IRIS CO COXSACKIE IN DISMISSAL … 629
## 6 BUSH JONATHAN ASAT-PA ALBION IN 30D SP 179