Load libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(dbplyr)

## 
## Attaching package: 'dbplyr'

## The following objects are masked from 'package:dplyr':
## 
##     ident, sql

library(RPostgres)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dbplyr::ident() masks dplyr::ident()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dbplyr::sql()   masks dplyr::sql()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(geojsonio)

## Registered S3 method overwritten by 'geojsonsf':
##   method        from   
##   print.geojson geojson
## 
## Attaching package: 'geojsonio'
## 
## The following object is masked from 'package:base':
## 
##     pretty

library(RCurl)

## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete

library(tidyr)
library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(geojsonR)
library(ggplot2)
library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(Hmisc)

## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(RMySQL)

## Loading required package: DBI

Introduction:

For our final project we will be exploring the relationship between obesity rates and socioeconomic factors. Obesity, a significant public health concern, is associated with various chronic conditions. By analyzing publicly available data sets, we will investigate how variables such as median state income and the density of fast-food restaurants correlate with obesity rates. Through this analysis, we seek to better understand the drivers of obesity and contribute to informed strategies for improving public health outcomes.

Establish connection to the MySQL database.

vectorpass <- c('t','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x','j','u','m','p','o','v','e','r','l','a','z','y','d','o','g','0','1','2','3','4','5','6','7','8','9',".")
#list of alphanumeric characters to make the password into a simple code.

pass = paste(vectorpass[8],vectorpass[3],vectorpass[6],vectorpass[1],vectorpass[2],vectorpass[42],vectorpass[29],vectorpass[3],vectorpass[13],vectorpass[6],vectorpass[22],vectorpass[11],vectorpass[36],vectorpass[41],sep="")
#constructing the password from the list

mysqlconnection <- dbConnect(RMySQL::MySQL(),

                             host="cunydata607sql.mysql.database.azure.com",

                             dbname = "keith.denivo49",

                             port=3306,

                             user="keith.denivo49",

                             password=pass)
#connecting to database.
src_dbi(mysqlconnection)

## src:  mysql 8.0.35 [@cunydata607sql.mysql.database.azure.com:/keith.denivo49]
## tbls: job_names_tidy, job_names_untidy, merged_df, number_restaurants_df,
##   restaurant_df, soft_skills, soft_skills_tidy, state_info_df,
##   state_obesity_df, state_obesity_info_df, technical_skills

#lists all the tables in the database with tbls.

Retrieve and preprocess obesity data from a GeoJSON file.

c <- getURL("https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/National_Obesity_By_State.geojson")


#geojson file stored temporary
tmp_geojson <- tempfile(fileext = ".geojson")
download.file(
  "https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/National_Obesity_By_State.geojson",
  tmp_geojson
)

#use sf library to read geojson

obesity_geojson_sf <- read_sf(tmp_geojson)

#graph geojson
ggplot(obesity_geojson_sf) +
  geom_sf(fill = "#69b3a2", color = "black") +
  theme_void()

glimpse(obesity_geojson_sf)

## Rows: 52
## Columns: 6
## $ FID          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ NAME         <chr> "Texas", "California", "Kentucky", "Georgia", "Wisconsin"…
## $ Obesity      <dbl> 32.4, 24.2, 34.6, 30.7, 30.7, 30.1, 29.2, 33.8, 36.2, 25.…
## $ SHAPE_Length <dbl> 45.04745, 40.75593, 20.34835, 17.28845, 22.42884, 24.5650…
## $ SHAPE_Area   <dbl> 65.778624, 41.730164, 10.665636, 14.682554, 16.512937, 28…
## $ geometry     <MULTIPOLYGON [°]> MULTIPOLYGON (((-106.6235 3..., MULTIPOLYGON…

class(obesity_geojson_sf)

## [1] "sf"         "tbl_df"     "tbl"        "data.frame"

#extract the obesity % for each state
state_obesity_df <- obesity_geojson_sf |> 
  select(NAME, Obesity)
state_obesity_df <- as.data.frame(state_obesity_df)
state_obesity_df <-  subset(state_obesity_df, select = -c(geometry) )
glimpse(state_obesity_df)

## Rows: 52
## Columns: 2
## $ NAME    <chr> "Texas", "California", "Kentucky", "Georgia", "Wisconsin", "Or…
## $ Obesity <dbl> 32.4, 24.2, 34.6, 30.7, 30.7, 30.1, 29.2, 33.8, 36.2, 25.0, 31…

Read in state information dataset.

state_info_df <- read_csv("https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/Population%20%26%20Abbreviation%20%26%20Income.csv")

## Rows: 51 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Abreviation
## dbl (1): Median Income
## num (1): 2023 Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(state_info_df)

## Rows: 51
## Columns: 4
## $ State             <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Califor…
## $ Abreviation       <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC"…
## $ `2023 Population` <dbl> 5108468, 733406, 7431344, 3067732, 38965193, 5877610…
## $ `Median Income`   <dbl> 53992, 77845, 69056, 52528, 84907, 82254, 83771, 710…

Merge obesity and state information datasets.

colnames(state_obesity_df)[colnames(state_obesity_df) == 'NAME'] <- 'State'

state_obesity_info_df <- merge(state_info_df,state_obesity_df,by="State")
glimpse(state_obesity_info_df)

## Rows: 51
## Columns: 5
## $ State             <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Califor…
## $ Abreviation       <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC"…
## $ `2023 Population` <dbl> 5108468, 733406, 7431344, 3067732, 38965193, 5877610…
## $ `Median Income`   <dbl> 53992, 77845, 69056, 52528, 84907, 82254, 83771, 710…
## $ Obesity           <dbl> 35.6, 29.8, 28.4, 34.5, 24.2, 20.2, 25.3, 29.7, 22.1…

Analyze the relationship between obesity rates and median income.

#Obesity % vs Median income
ggplot(state_obesity_info_df, aes(x = state_obesity_info_df$`Median Income`, y = state_obesity_info_df$Obesity))+
  geom_point()+
  geom_smooth(method= lm)

## Warning: Use of `` state_obesity_info_df$`Median Income` `` is discouraged.
## ℹ Use `Median Income` instead.

## Warning: Use of `state_obesity_info_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.

## Warning: Use of `` state_obesity_info_df$`Median Income` `` is discouraged.
## ℹ Use `Median Income` instead.

## Warning: Use of `state_obesity_info_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.

## `geom_smooth()` using formula = 'y ~ x'

mod1 = lm(formula = state_obesity_info_df$Obesity ~ state_obesity_info_df$'Median Income', data = state_obesity_info_df)
summary(mod1)

## 
## Call:
## lm(formula = state_obesity_info_df$Obesity ~ state_obesity_info_df$"Median Income", 
##     data = state_obesity_info_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2855 -1.3561  0.6406  1.5842  5.2207 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            4.779e+01  2.140e+00  22.333  < 2e-16
## state_obesity_info_df$"Median Income" -2.674e-04  3.051e-05  -8.763 1.34e-11
##                                          
## (Intercept)                           ***
## state_obesity_info_df$"Median Income" ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.441 on 49 degrees of freedom
## Multiple R-squared:  0.6104, Adjusted R-squared:  0.6025 
## F-statistic: 76.78 on 1 and 49 DF,  p-value: 1.335e-11

Read in fast-food restaurant data and calculate the number of fast-food restaurants per state.

restaurant_df <- read_csv("https://raw.githubusercontent.com/amedina613/MSDS-Data607-Final-Project/main/Fast_Food_Restaurants.csv")

## Rows: 10000 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (11): id, address, categories, city, country, keys, name, postalCode, p...
## dbl   (3): index, latitude, longitude
## dttm  (2): dateAdded, dateUpdated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

unique(restaurant_df$province)

##  [1] "LA" "TN" "GA" "MI" "CA" "NY" "TX" "OH" "SC" "MS" "IN" "MA" "FL" "MO" "PA"
## [16] "IL" "CT" "KY" "KS" "IA" "HI" "ID" "NC" "DE" "AZ" "WI" "CO" "AR" "SD" "WV"
## [31] "WY" "VA" "OR" "MD" "WA" "UT" "NV" "OK" "ME" "ND" "NH" "VT" "NJ" "NE" "MN"
## [46] "MT" "AL" "NM" "AK" "RI"

number_restaurants_df <- as.data.frame(table(unlist(restaurant_df$province)))

Upload the data frames to the database:

conn <-  mysqlconnection
con <- mysqlconnection

#table was created

dbWriteTable(conn = con,
             name = "restaurant_df",
             value = restaurant_df,            
             overwrite = TRUE)

## [1] TRUE

dbWriteTable(conn = con,
             name = "state_info_df",
             value = state_info_df,            
             overwrite = TRUE)

## [1] TRUE

dbWriteTable(conn = con,
             name = "state_obesity_df",
             value = state_obesity_df,           
             overwrite = TRUE)

## [1] TRUE

dbWriteTable(conn = con,
             name = "number_restaurants_df",
             value = number_restaurants_df,            
             overwrite = TRUE)

## [1] TRUE

dbWriteTable(conn = con,
             name = "state_obesity_info_df",
             value = state_obesity_info_df,            
             overwrite = TRUE)

## [1] TRUE

src_dbi(mysqlconnection)

## src:  mysql 8.0.35 [@cunydata607sql.mysql.database.azure.com:/keith.denivo49]
## tbls: job_names_tidy, job_names_untidy, merged_df, number_restaurants_df,
##   restaurant_df, soft_skills, soft_skills_tidy, state_info_df,
##   state_obesity_df, state_obesity_info_df, technical_skills

Merge obesity and restaurant datasets.

names(number_restaurants_df)[1] <- "Abreviation"
names(number_restaurants_df)[2] <- "number_restaurants"
state_obesity_info_restaurant_df <- merge(state_obesity_info_df,number_restaurants_df,by="Abreviation")

state_obesity_info_restaurant_df <- state_obesity_info_restaurant_df %>%
   mutate(restaurant_per_capita = number_restaurants / `2023 Population`)

Calculate restaurants per capita and analyze the relationship with obesity rates.

#Obesity % vs restaurants per capita
ggplot(state_obesity_info_restaurant_df, aes(x = state_obesity_info_restaurant_df$restaurant_per_capita, y = state_obesity_info_restaurant_df$Obesity))+
  geom_point()+
  geom_smooth(method= lm)

## Warning: Use of `state_obesity_info_restaurant_df$restaurant_per_capita` is discouraged.
## ℹ Use `restaurant_per_capita` instead.

## Warning: Use of `state_obesity_info_restaurant_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.

## Warning: Use of `state_obesity_info_restaurant_df$restaurant_per_capita` is discouraged.
## ℹ Use `restaurant_per_capita` instead.

## Warning: Use of `state_obesity_info_restaurant_df$Obesity` is discouraged.
## ℹ Use `Obesity` instead.

## `geom_smooth()` using formula = 'y ~ x'

mod2 = lm(formula = state_obesity_info_restaurant_df$Obesity ~ state_obesity_info_restaurant_df$restaurant_per_capita, data = state_obesity_info_restaurant_df)
summary(mod2)

## 
## Call:
## lm(formula = state_obesity_info_restaurant_df$Obesity ~ state_obesity_info_restaurant_df$restaurant_per_capita, 
##     data = state_obesity_info_restaurant_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6388 -2.4296  0.0322  1.8969  9.2473 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                            2.623e+01  1.662e+00
## state_obesity_info_restaurant_df$restaurant_per_capita 1.036e+05  5.120e+04
##                                                        t value Pr(>|t|)    
## (Intercept)                                             15.783   <2e-16 ***
## state_obesity_info_restaurant_df$restaurant_per_capita   2.023   0.0487 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.657 on 48 degrees of freedom
## Multiple R-squared:  0.07854,    Adjusted R-squared:  0.05935 
## F-statistic: 4.091 on 1 and 48 DF,  p-value: 0.04869

Conclusion:

Based on these findings, we can conclude that there is a statistically significant negative relationship between median income and obesity rate. The higher the median income, the lower the obesity rate are. However, it’s important to note that while this relationship is statistically significant, the coefficient for “Median Income” is very small, suggesting that other factors may also play a significant role in determining obesity rates.

We can also conclude that there is a statistically significant positive relationship between restaurants per capita and obesity rate.

Data 607 final project

Keith DeNivo

2024-04-29

Load libraries

Introduction:

Upload the data frames to the database:

Conclusion: