# Preview the first five rows of the raw census sample.
utils::head(census_sample, n = 5L)
## # A tibble: 5 × 12
## ...1 cbg_id median_income median_age white_ppl black_ppl asian_ppl
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 10010201001 NA 34.1 0.785 0.215 0
## 2 2 10010201002 77813 41.8 0.856 0.0822 0.00711
## 3 3 10010202001 25179 38.2 0.376 0.592 0
## 4 4 10010202002 45104 39.7 0.498 0.462 0.0194
## 5 5 10010203001 55222 34.9 0.626 0.218 0.0114
## # ℹ 5 more variables: hispanic_ppl <dbl>, edu_bachelors <dbl>,
## # edu_masters <dbl>, edu_college <dbl>, edu_phd <dbl>
# Show the column types and dimensions of the raw census sample.
utils::str(census_sample)
## spc_tbl_ [20,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:20000] 1 2 3 4 5 6 7 8 9 10 ...
## $ cbg_id : num [1:20000] 1e+10 1e+10 1e+10 1e+10 1e+10 ...
## $ median_income: num [1:20000] NA 77813 25179 45104 55222 ...
## $ median_age : num [1:20000] 34.1 41.8 38.2 39.7 34.9 49.1 58.1 37 41.5 34.4 ...
## $ white_ppl : num [1:20000] 0.785 0.856 0.376 0.498 0.626 ...
## $ black_ppl : num [1:20000] 0.2148 0.0822 0.5917 0.462 0.2179 ...
## $ asian_ppl : num [1:20000] 0 0.00711 0 0.01942 0.01142 ...
## $ hispanic_ppl : num [1:20000] 0.0215 0.0292 0.0135 0.0121 0.0964 ...
## $ edu_bachelors: num [1:20000] 0.1266 0.1893 0.0897 0.1312 0.1244 ...
## $ edu_masters : num [1:20000] 0.0485 0.1432 0.0116 0.0453 0.0807 ...
## $ edu_college : num [1:20000] 0.01477 0.02184 0.01495 0.01045 0.00607 ...
## $ edu_phd : num [1:20000] 0.03376 0.01092 0 0.00697 0.01092 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. cbg_id = col_double(),
## .. median_income = col_double(),
## .. median_age = col_double(),
## .. white_ppl = col_double(),
## .. black_ppl = col_double(),
## .. asian_ppl = col_double(),
## .. hispanic_ppl = col_double(),
## .. edu_bachelors = col_double(),
## .. edu_masters = col_double(),
## .. edu_college = col_double(),
## .. edu_phd = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Keep only the rows that have no missing value in any column.
census_sample_clean <- na.omit(census_sample)

# Build the working subset from the first 1,000 complete rows.
# head() stops at the end of the data, so no all-NA padding rows appear if
# fewer than 1,000 rows survive na.omit() (which `[1:1000, ]` would add on a
# tibble).
# NOTE(review): the name `sample` masks base::sample() for readers; it is kept
# because code outside this chunk may refer to it.
sample <- head(census_sample_clean, 1000) %>%
  mutate(
    # Combined education measure: the sum of the four edu_* columns.
    edu_degree = edu_bachelors + edu_masters + edu_college + edu_phd,
    # Treat the block-group id as a categorical label rather than a number.
    cbg_id = as.factor(cbg_id)
  ) %>%
  # Drop the first column (the unnamed `...1` column) and the individual
  # edu_* columns now summarised by edu_degree.
  select(-c(1, edu_college, edu_bachelors, edu_masters, edu_phd))
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
This pairwise-relationships plot shows the correlations between the variables. Correlations marked with *** are highly statistically significant (p < 0.001); the stars indicate significance, not the size of the correlation. For example, Black ppl and white ppl have a correlation of -0.975***, a very strong and significant negative relationship: when one is high, the other is low.
This correlation plot shows the relationships between my numerical variables. The number in each cell is the correlation between the two variables, ranging from -1 (a negative relationship: when one increases, the other decreases) to 1 (a positive relationship: when one increases, the other also increases); 0 indicates no linear relationship between the variables. The color scale follows the numeric scale: strong blue for negative correlations and strong red for positive correlations.
The data shows several interesting patterns:
The bar plot highlights differences in racial composition across the dataset. Some racial groups, like white people, appear significantly larger in proportion compared to others.
This imbalance may reflect demographic trends in the sampled population, or it may lead to misleading conclusions about the less-represented racial groups.