# Many students postpone ingesting their dataset into RStudio. The sooner you load your dataset into RStudio and start playing with it, the more time you will have to explore your dataset and use statistical methods to develop insights.
# Start a new markdown or quarto document.
# Load library(tidyverse) and any other libraries you might need.
# Load the dataset.
# Include subtitles in between each chunk.
# Include comments to provide details about what you intend each chunk to do.
# Include at least 3 different plots (histograms, line graphs, bar graphs, scatterplots, etc.) using either base R or ggplot.
# Knit your document and publish it in either HTML, Word, or PDF format.
# Final Project Markdown/Quarto ----
# Load Libraries/Set Directory ----

# Attach tidyverse (read_csv, dplyr verbs, ggplot2) and sf.
# Package startup messages are kept below as `#>` comments so the code parses.
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.2 ✔ readr 2.1.4
#> ✔ forcats 1.0.0 ✔ stringr 1.5.0
#> ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
#> ✔ purrr 1.0.2
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
#> Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

# NOTE(review): this absolute path is specific to one machine, so the script
# will fail elsewhere. Prefer running from the project directory (or an
# RStudio project) and dropping setwd(); kept as-is because the relative
# read_csv() paths in the next section depend on the working directory.
setwd("/Users/blossomanyanwu/Documents/MATH 217 HM")

# Load in Data sets ----
# Read the two CSV inputs; both paths are relative to the working directory.
# The readr messages are kept as `#>` comments. They show that each file has a
# blank first column header, which readr renames to `...1`.
community <- read_csv("finalmerge.csv")
#> New names:
#> Rows: 1406 Columns: 16
#> ── Column specification
#> ──────────────────────────────────────────────────────── Delimiter: "," chr
#> (9): State, Census Tract, copdrates, 95% Confidence Interval, Confidence... dbl
#> (6): ...1, StateFIPS, CensusTract, Year, Number, parkdistancepopulation lgl
#> (1): ...11
#> ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
#> Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> • `` -> `...1`
walkability <- read_csv("marylandwalk.csv")
#> New names:
#> Rows: 3926 Columns: 118
#> ── Column specification
#> ──────────────────────────────────────────────────────── Delimiter: "," chr
#> (2): CSA_Name, CBSA_Name dbl (116): ...1, OBJECTID, GEOID10, GEOID20, STATEFP,
#> COUNTYFP, TRACTCE, BLK...
#> ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
#> Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> • `` -> `...1`
# Clean Datasets ----
# Use gsub() to remove the percent signs from the values.
# Visualization 1 (Walkability Data) ----

# Scatter plot of the relationship between the number of working-age people
# within a census tract (P_WrkAge) and the tract's Walkability Index
# (NatWalkInd). Points are semi-transparent so overlapping tracts stay visible.
ggplot(walkability, aes(x = P_WrkAge, y = NatWalkInd)) +
  geom_point(alpha = 0.5) +
  labs(x = "Workage", y = "Walkability Score") +
  theme_minimal()

# Visual 2: Walkability Index Scores versus Number of Population that is working age ----
# Scatter plot of Pct_AO2p against the Walkability Index (NatWalkInd).
# NOTE(review): the heading preceding this chunk mentions the working-age
# population, but the variable plotted here is Pct_AO2p — confirm which one
# is intended.
ggplot(walkability, aes(x = Pct_AO2p, y = NatWalkInd)) +
  geom_point(alpha = 0.5) +
  # The x label previously read "Workage" (copied from the first plot); it now
  # matches the label used for Pct_AO2p in the later county-level plot.
  labs(x = "% of 2+ Car Owning Homes", y = "Walkability Score") +
  theme_minimal()

# Visual 3: Proportion of population earning less than 1250 monthly and Walkability Index ----
# Scatter plot of E_LowWageWk against the Walkability Index (NatWalkInd).
ggplot(walkability, aes(x = E_LowWageWk, y = NatWalkInd)) +
  geom_point(alpha = 0.5) +
  # The x label previously read "Workage" (copied from the first plot). The new
  # wording follows this section's heading — presumably E_LowWageWk is the
  # low-wage earner measure; verify against the data dictionary.
  labs(
    x = "Low-wage workers (earning less than 1250 monthly)",
    y = "Walkability Score"
  ) +
  theme_minimal()

# Preliminary Linear Model ----
# Simple linear regression of the Walkability Index on E_LowWageWk.
# Formula syntax is y ~ x: dependent variable ~ independent variable.
model2 <- lm(NatWalkInd ~ E_LowWageWk, data = walkability)

# Print coefficients, residual summary, and fit statistics. The recorded
# output is kept below as `#>` comments; it shows a statistically significant
# slope but an R-squared of only about 0.039.
summary(model2)
#> Call:
#> lm(formula = NatWalkInd ~ E_LowWageWk, data = walkability)
#> Residuals:
#> Min 1Q Median 3Q Max
#> -11.778 -3.867 0.559 3.381 8.895
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1.010e+01 7.240e-02 139.54 <2e-16 ***
#> E_LowWageWk 2.737e-03 2.171e-04 12.61 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> Residual standard error: 4.123 on 3924 degrees of freedom
#> Multiple R-squared: 0.03894, Adjusted R-squared: 0.03869
#> F-statistic: 159 on 1 and 3924 DF, p-value: < 2.2e-16
# Histogram ----

# Histogram of R_MedWageWk over all rows of `walkability`, using bins of
# width 2.
ggplot(walkability, aes(x = R_MedWageWk)) +
  geom_histogram(binwidth = 2, fill = "red", color = "black", alpha = 0.7) +
  # Title typo fixed ("less that" -> "less than") and parenthesis closed.
  labs(
    title = "Income Distributions (less than 3300 but more than 1250 USD)",
    y = "Frequency"
  ) +
  theme_minimal()

# Subset MoCo ----
# Keep only the Montgomery County ("MoCo") rows of the walkability data.
# COUNTYFP is parsed as a number, so the county FIPS code 031 appears as 31.
# The previous value, 3, is FIPS 003 (Anne Arundel County), not Montgomery.
# NOTE(review): confirm that "MoCo" means Montgomery County, MD.
moco_walkability <- walkability %>%
  filter(COUNTYFP == 31)

# Scatter plot of R_MedWageWk against the Walkability Index for that subset.
ggplot(moco_walkability, aes(x = R_MedWageWk, y = NatWalkInd)) +
  geom_point(alpha = 0.5) +
  labs(x = "People earning above poverty line", y = "Walkability Score") +
  theme_minimal()

# DataExplorer provides plot_correlation(), used in the next step.
library(DataExplorer)
# Correlation heatmap of the `community` data (DataExplorer).
# The recorded console output is kept as `#>` comments: six high-cardinality
# columns were ignored, and at least one column had zero standard deviation.
plot_correlation(community)
#> 6 features with more than 20 categories ignored!
#> Census.Tract: 1406 categories
#> copdrates: 106 categories
#> X95..Confidence.Interval: 677 categories
#> Confidence.Interval.Low: 96 categories
#> Confidence.Interval.High: 118 categories
#> oldhousing: 1164 categories
#> Warning in cor(x = structure(list(...1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, : the
#> standard deviation is zero
#> Warning: Removed 80 rows containing missing values (`geom_text()`).
# Visual 4: Healthcare Jobs/Walkability ----
# E8_Hlth10
# NOTE(review): the heading and the E8_Hlth10 note point at a healthcare-jobs
# variable, but the plot below maps Pct_AO2p (2+ car households) — confirm
# which variable this visual is meant to show.
ggplot(moco_walkability, aes(x = Pct_AO2p, y = NatWalkInd)) +
  geom_point(alpha = 0.5) +
  labs(x = "% of 2+ Car Owning Homes", y = "Walkability Score") +
  # Optional layers, currently disabled:
  # stat_ellipse()
  # geom_smooth()
  theme_minimal()

# Visual 5 ----
# Bar chart of Number (bar height) against parkdistancepopulation from the
# `community` data.
# Columns are referenced by bare name inside aes(); `community$...` bypasses
# ggplot2's data masking and is discouraged. geom_col() is the direct
# equivalent of geom_bar(stat = "identity").
# NOTE(review): the title and x label mention counties, but x is mapped to
# parkdistancepopulation — confirm the intended x variable and label.
ggplot(community, aes(x = parkdistancepopulation, y = Number)) +
  geom_col() +
  labs(
    title = "Number of People living Near Park per County",
    x = "Counties",
    y = "Frequency"
  ) +
  theme_classic()
#> Warning: Removed 133 rows containing missing values (`position_stack()`).