library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(RPostgres)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dbplyr::ident() masks dplyr::ident()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dbplyr::sql() masks dplyr::sql()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(geojsonio)
## Registered S3 method overwritten by 'geojsonsf':
## method from
## print.geojson geojson
##
## Attaching package: 'geojsonio'
##
## The following object is masked from 'package:base':
##
## pretty
library(RCurl)
##
## Attaching package: 'RCurl'
##
## The following object is masked from 'package:tidyr':
##
## complete
library(tidyr)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(geojsonR)
library(ggplot2)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(RMySQL)
## Loading required package: DBI
For our final project we will be exploring the relationship between obesity rates and socioeconomic factors. Obesity, a significant public health concern, is associated with various chronic conditions. By analyzing publicly available data sets, we will investigate how variables such as median state income and the density of fast-food restaurants correlate with obesity rates. Through this analysis, we seek to better understand the drivers of obesity and contribute to informed strategies for improving public health outcomes.
Establish connection to the MySQL database.
vectorpass <- c('t','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x','j','u','m','p','o','v','e','r','l','a','z','y','d','o','g','0','1','2','3','4','5','6','7','8','9',".")
#list of alphanumeric characters to make the password into a simple code.
pass = paste(vectorpass[8],vectorpass[3],vectorpass[6],vectorpass[1],vectorpass[2],vectorpass[42],vectorpass[29],vectorpass[3],vectorpass[13],vectorpass[6],vectorpass[22],vectorpass[11],vectorpass[36],vectorpass[41],sep="")
#constructing the password from the list
mysqlconnection <- dbConnect(RMySQL::MySQL(),
host="cunydata607sql.mysql.database.azure.com",
dbname = "keith.denivo49",
port=3306,
user="keith.denivo49",
password=pass)
#connecting to database.
src_dbi(mysqlconnection)
## src: mysql 8.0.35 [@cunydata607sql.mysql.database.azure.com:/keith.denivo49]
## tbls: job_names_tidy, job_names_untidy, merged_df, number_restaurants_df,
## restaurant_df, soft_skills, soft_skills_tidy, state_info_df,
## state_obesity_df, state_obesity_info_df, technical_skills
#lists all the tables in the database with tbls.
Retrieve and preprocess obesity data from a GeoJSON file.
c <- getURL("https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/National_Obesity_By_State.geojson")
#geojson file stored temporary
tmp_geojson <- tempfile(fileext = ".geojson")
download.file(
"https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/National_Obesity_By_State.geojson",
tmp_geojson
)
#use sf library to read geojson
obesity_geojson_sf <- read_sf(tmp_geojson)
#graph geojson
ggplot(obesity_geojson_sf) +
geom_sf(fill = "#69b3a2", color = "black") +
theme_void()
glimpse(obesity_geojson_sf)
## Rows: 52
## Columns: 6
## $ FID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ NAME <chr> "Texas", "California", "Kentucky", "Georgia", "Wisconsin"…
## $ Obesity <dbl> 32.4, 24.2, 34.6, 30.7, 30.7, 30.1, 29.2, 33.8, 36.2, 25.…
## $ SHAPE_Length <dbl> 45.04745, 40.75593, 20.34835, 17.28845, 22.42884, 24.5650…
## $ SHAPE_Area <dbl> 65.778624, 41.730164, 10.665636, 14.682554, 16.512937, 28…
## $ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-106.6235 3..., MULTIPOLYGON…
class(obesity_geojson_sf)
## [1] "sf" "tbl_df" "tbl" "data.frame"
#extract the obesity % for each state
state_obesity_df <- obesity_geojson_sf |>
select(NAME, Obesity)
state_obesity_df <- as.data.frame(state_obesity_df)
state_obesity_df <- subset(state_obesity_df, select = -c(geometry) )
glimpse(state_obesity_df)
## Rows: 52
## Columns: 2
## $ NAME <chr> "Texas", "California", "Kentucky", "Georgia", "Wisconsin", "Or…
## $ Obesity <dbl> 32.4, 24.2, 34.6, 30.7, 30.7, 30.1, 29.2, 33.8, 36.2, 25.0, 31…
Read in state information dataset.
state_info_df <- read_csv("https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/Population%20%26%20Abbreviation%20%26%20Income.csv")
## Rows: 51 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Abreviation
## dbl (1): Median Income
## num (1): 2023 Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(state_info_df)
## Rows: 51
## Columns: 4
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Califor…
## $ Abreviation <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC"…
## $ `2023 Population` <dbl> 5108468, 733406, 7431344, 3067732, 38965193, 5877610…
## $ `Median Income` <dbl> 53992, 77845, 69056, 52528, 84907, 82254, 83771, 710…
Merge obesity and state information datasets.
colnames(state_obesity_df)[colnames(state_obesity_df) == 'NAME'] <- 'State'
state_obesity_info_df <- merge(state_info_df,state_obesity_df,by="State")
glimpse(state_obesity_info_df)
## Rows: 51
## Columns: 5
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Califor…
## $ Abreviation <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC"…
## $ `2023 Population` <dbl> 5108468, 733406, 7431344, 3067732, 38965193, 5877610…
## $ `Median Income` <dbl> 53992, 77845, 69056, 52528, 84907, 82254, 83771, 710…
## $ Obesity <dbl> 35.6, 29.8, 28.4, 34.5, 24.2, 20.2, 25.3, 29.7, 22.1…
Analyze the relationship between obesity rates and median income.
#Obesity % vs Median income
ggplot(state_obesity_info_df, aes(x = state_obesity_info_df$`Median Income`, y = state_obesity_info_df$Obesity))+
geom_point()+
geom_smooth(method= lm)
## Warning: Use of `` state_obesity_info_df$`Median Income` `` is discouraged.
## ℹ Use `Median Income` instead.
## Warning: Use of `state_obesity_info_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.
## Warning: Use of `` state_obesity_info_df$`Median Income` `` is discouraged.
## ℹ Use `Median Income` instead.
## Warning: Use of `state_obesity_info_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.
## `geom_smooth()` using formula = 'y ~ x'
mod1 = lm(formula = state_obesity_info_df$Obesity ~ state_obesity_info_df$'Median Income', data = state_obesity_info_df)
summary(mod1)
##
## Call:
## lm(formula = state_obesity_info_df$Obesity ~ state_obesity_info_df$"Median Income",
## data = state_obesity_info_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2855 -1.3561 0.6406 1.5842 5.2207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.779e+01 2.140e+00 22.333 < 2e-16
## state_obesity_info_df$"Median Income" -2.674e-04 3.051e-05 -8.763 1.34e-11
##
## (Intercept) ***
## state_obesity_info_df$"Median Income" ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.441 on 49 degrees of freedom
## Multiple R-squared: 0.6104, Adjusted R-squared: 0.6025
## F-statistic: 76.78 on 1 and 49 DF, p-value: 1.335e-11
Read in fast-food restaurant data and calculate the number of fast-food restaurants per state.
restaurant_df <- read_csv("https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/Fast_Food_Restaurants.csv")
## Rows: 10000 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): id, address, categories, city, country, keys, name, postalCode, p...
## dbl (3): index, latitude, longitude
## dttm (2): dateAdded, dateUpdated
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unique(restaurant_df$province)
## [1] "LA" "TN" "GA" "MI" "CA" "NY" "TX" "OH" "SC" "MS" "IN" "MA" "FL" "MO" "PA"
## [16] "IL" "CT" "KY" "KS" "IA" "HI" "ID" "NC" "DE" "AZ" "WI" "CO" "AR" "SD" "WV"
## [31] "WY" "VA" "OR" "MD" "WA" "UT" "NV" "OK" "ME" "ND" "NH" "VT" "NJ" "NE" "MN"
## [46] "MT" "AL" "NM" "AK" "RI"
number_restaurants_df <- as.data.frame(table(unlist(restaurant_df$province)))
conn <- mysqlconnection
con <- mysqlconnection
#table was created
dbWriteTable(conn = con,
name = "restaurant_df",
value = restaurant_df,
overwrite = TRUE)
## [1] TRUE
dbWriteTable(conn = con,
name = "state_info_df",
value = state_info_df,
overwrite = TRUE)
## [1] TRUE
dbWriteTable(conn = con,
name = "state_obesity_df",
value = state_obesity_df,
overwrite = TRUE)
## [1] TRUE
dbWriteTable(conn = con,
name = "number_restaurants_df",
value = number_restaurants_df,
overwrite = TRUE)
## [1] TRUE
dbWriteTable(conn = con,
name = "state_obesity_info_df",
value = state_obesity_info_df,
overwrite = TRUE)
## [1] TRUE
src_dbi(mysqlconnection)
## src: mysql 8.0.35 [@cunydata607sql.mysql.database.azure.com:/keith.denivo49]
## tbls: job_names_tidy, job_names_untidy, merged_df, number_restaurants_df,
## restaurant_df, soft_skills, soft_skills_tidy, state_info_df,
## state_obesity_df, state_obesity_info_df, technical_skills
Merge obesity and restaurant datasets.
names(number_restaurants_df)[1] <- "Abreviation"
names(number_restaurants_df)[2] <- "number_restaurants"
state_obesity_info_restaurant_df <- merge(state_obesity_info_df,number_restaurants_df,by="Abreviation")
state_obesity_info_restaurant_df <- state_obesity_info_restaurant_df %>%
mutate(restaurant_per_capita = number_restaurants / `2023 Population`)
Calculate restaurants per capita and analyze the relationship with obesity rates.
#Obesity % vs restaurants per capita
ggplot(state_obesity_info_restaurant_df, aes(x = state_obesity_info_restaurant_df$restaurant_per_capita, y = state_obesity_info_restaurant_df$Obesity))+
geom_point()+
geom_smooth(method= lm)
## Warning: Use of `state_obesity_info_restaurant_df$restaurant_per_capita` is discouraged.
## ℹ Use `restaurant_per_capita` instead.
## Warning: Use of `state_obesity_info_restaurant_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.
## Warning: Use of `state_obesity_info_restaurant_df$restaurant_per_capita` is discouraged.
## ℹ Use `restaurant_per_capita` instead.
## Warning: Use of `state_obesity_info_restaurant_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.
## `geom_smooth()` using formula = 'y ~ x'
mod2 = lm(formula = state_obesity_info_restaurant_df$Obesity ~ state_obesity_info_restaurant_df$restaurant_per_capita, data = state_obesity_info_restaurant_df)
summary(mod2)
##
## Call:
## lm(formula = state_obesity_info_restaurant_df$Obesity ~ state_obesity_info_restaurant_df$restaurant_per_capita,
## data = state_obesity_info_restaurant_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6388 -2.4296 0.0322 1.8969 9.2473
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 2.623e+01 1.662e+00
## state_obesity_info_restaurant_df$restaurant_per_capita 1.036e+05 5.120e+04
## t value Pr(>|t|)
## (Intercept) 15.783 <2e-16 ***
## state_obesity_info_restaurant_df$restaurant_per_capita 2.023 0.0487 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.657 on 48 degrees of freedom
## Multiple R-squared: 0.07854, Adjusted R-squared: 0.05935
## F-statistic: 4.091 on 1 and 48 DF, p-value: 0.04869
Based on these findings, we can conclude that there is a statistically significant negative relationship between median income and obesity rate. The higher the median income, the lower the obesity rate are. However, it’s important to note that while this relationship is statistically significant, the coefficient for “Median Income” is very small, suggesting that other factors may also play a significant role in determining obesity rates.
We can also conclude that there is a statistically significant positive relationship between restaurants per capita and obesity rate.