library(readr)
# Read the New York health CSV into `healthny`.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file location exists -- consider a project-relative path.
healthny <- read_csv(
  file = "C:/Users/Nusrat/Desktop/MA - 3rd semester, Spring 19/SOC 712 - Advanced Analytics (R)/Assignment 2 - Data/healthny.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   County = col_character(),
##   TotDIabetics = col_logical()
## )
## See spec(...) for full column specifications.
# Preview the first six rows to check that the columns were read as expected.
head(healthny, n = 6)
## # A tibble: 6 x 25
##   County PhysicallyUnhea~ MentallyUnhealt~ FairOrPoorHealth
##   <chr>             <dbl>            <dbl>            <dbl>
## 1 Alban~              3.3              3.4             12.2
## 2 Alleg~              3.6              3.7             13.5
## 3 Bronx~              5                4.3             28.7
## 4 Broom~              3.6              3.7             13.7
## 5 Catta~              3.7              3.8             13.9
## 6 Cayug~              3.3              3.5             12.4
## # ... with 21 more variables: TotHealthCareCosts <dbl>,
## #   PeopleWithoutInsuranceUnder19 <dbl>,
## #   PeopleWithoutInsurance19to64 <dbl>,
## #   PeopleWithoutInsuranceUnder65 <dbl>,
## #   TotPeopleWithoutInsuranceUnder19 <dbl>,
## #   TotPeopleWithoutInsurance19to64 <dbl>,
## #   TotPeopleWithoutInsuranceUnder65 <dbl>, Diabetics <dbl>,
## #   DiabeticMedicareEnrolleesReceivingHba1cTest <dbl>, TotDIabetics <lgl>,
## #   TotDiabeticMedicareEnrolleesReceivingHba1cTest <dbl>,
## #   TotTeenBirthsFemales15to19 <dbl>, TotChlamydiaCases <dbl>,
## #   TotHIVPrevalance <dbl>, CurrentSmokers18andup <dbl>,
## #   DrinkingAdults18andup <dbl>, LimitedAccessToHealthyFoods <dbl>,
## #   AccessToExerciseOpportunities <dbl>, ObesePersons20orOlder <dbl>,
## #   PhysicallyInactivePersons <dbl>, ChildrenWithAccessToFreeLunch <dbl>
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Print only the five variables used in the rest of the analysis.
# The result is displayed, not stored.
healthny %>%
  select(
    PhysicallyUnhealthyDaysPerMonth,
    AccessToExerciseOpportunities,
    Diabetics,
    ObesePersons20orOlder,
    LimitedAccessToHealthyFoods
  )
## # A tibble: 62 x 5
##    PhysicallyUnhea~ AccessToExercis~ Diabetics ObesePersons20o~
##               <dbl>            <dbl>     <dbl>            <dbl>
##  1              3.3             85.6       8.6             27.2
##  2              3.6             48.6       8.9             29.8
##  3              5               98.6      14.7             28.9
##  4              3.6             87.1       9.1             24.7
##  5              3.7             62.5       9.7             28.4
##  6              3.3             72.4       8.8             30.3
##  7              3.6             68.8       9.9             29.8
##  8              3.4             74.4       9.2             31.9
##  9              3.5             66.0       9.7             32.8
## 10              3.4             66.8       8.5             29.6
## # ... with 52 more rows, and 1 more variable:
## #   LimitedAccessToHealthyFoods <dbl>
# Mean of PhysicallyUnhealthyDaysPerMonth over all rows of `healthny`,
# ignoring missing values. This is an unweighted mean across rows.
healthny %>%
  summarise(
    AvgPhysicallyUnhealthy = mean(PhysicallyUnhealthyDaysPerMonth, na.rm = TRUE)
  )
## # A tibble: 1 x 1
##   AvgPhysicallyUnhealthy
##                    <dbl>
## 1                   3.41
library(ggplot2)
# Scatterplot: access to exercise opportunities (x) against diabetics (y),
# one point per row of `healthny`.
ggplot(
  data = healthny,
  mapping = aes(x = AccessToExerciseOpportunities, y = Diabetics)
) +
  geom_point() +
  labs(title = "Diabetics and Exercise")

# Correlation between exercise access and diabetics (cor() defaults to Pearson).
# No NA handling is requested, so a missing value in either column yields NA.
with(healthny, cor(AccessToExerciseOpportunities, Diabetics))
## [1] 0.1837253
library(ggplot2)
# Scatterplot: limited access to healthy foods (x) against obesity among
# persons 20 or older (y), one point per row of `healthny`.
ggplot(
  data = healthny,
  mapping = aes(x = LimitedAccessToHealthyFoods, y = ObesePersons20orOlder)
) +
  geom_point() +
  labs(title = "Obesity and Poor Diet")

# Correlation between limited healthy-food access and obesity
# (cor() defaults to Pearson). No NA handling is requested.
with(healthny, cor(LimitedAccessToHealthyFoods, ObesePersons20orOlder))
## [1] 0.09174848
library(readr)
# Read the Kings-vs-Queens subset used for the bar charts.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file location exists -- consider a project-relative path.
healthnyKingsvsQueens <- read_csv(
  file = "C:/Users/Nusrat/Desktop/MA - 3rd semester, Spring 19/SOC 712 - Advanced Analytics (R)/Assignment 2 - Data/healthnyKingsvsQueens.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   County = col_character(),
##   TotDIabetics = col_logical()
## )
## See spec(...) for full column specifications.
# Turn County into a factor whose levels follow ascending Diabetics values,
# so the bars below are drawn from lowest to highest.
healthnyKingsvsQueens <- healthnyKingsvsQueens %>%
  mutate(County = factor(County, levels = County[order(Diabetics)]))

# Bar chart of Diabetics by county. Bar heights are the data values themselves.
# The x-axis tick labels are hidden; the fill legend (placed on the right)
# identifies each county instead.
ggplot(
  data = healthnyKingsvsQueens,
  mapping = aes(x = County, y = Diabetics, fill = County)
) +
  geom_col() +
  labs(x = "County", y = "Diabetics") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "right"
  )

library(ggplot2)
# Bar chart of ObesePersons20orOlder by county. Bar heights are the data
# values themselves. The x-axis tick labels are hidden; the fill legend
# (placed on the right) identifies each county instead.
ggplot(
  data = healthnyKingsvsQueens,
  mapping = aes(x = County, y = ObesePersons20orOlder, fill = County)
) +
  geom_col() +
  labs(x = "County", y = "Obesity") +
  theme(
    axis.text.x = element_blank(),
    legend.position = "right"
  )

Report

For this assignment, I chose Health Data (2016) from Social Explorer. The purpose was to see how different health factors of people compare across New York counties.

The focus of this analysis was primarily on the variables diabetes and obesity. Results suggest that, averaged across the 62 counties, residents reported being physically unhealthy for approximately three and a half days (3.41) per month.

Next, I created a scatterplot to see how the prevalence of diabetes and access to exercise opportunities correlate. Results suggest that the correlation is quite low at 0.1837253, meaning that there is only a weak positive association between the two measures.

Then I created another scatterplot to find out how the prevalence of obesity and limited access to healthy foods correlate. Results show that the correlation is very low at 0.09174848, meaning that there is essentially no linear association between the two factors.

Lastly, I created two bar graphs with a modified dataset displaying the percentage of people who reported having diabetes and obesity in Kings County and Queens County. The graphs suggest that the percentage of people reporting diabetes is higher in Kings County than in Queens County, while the percentage of obese people in Kings County is lower than in Queens County. Therefore, diabetes and obesity do not appear to move together in these two counties, although a comparison of only two counties cannot establish whether the relationship is statistically significant.