Data Preparation
library(tidyverse)
library(kableExtra)

# Location of the raw CSV holding the county-level cancer data.
url <- "https://raw.githubusercontent.com/nnaemeka-git/global-datasets/main/cancer_reg.csv"

# Download and parse the CSV; read.csv() returns a base data.frame.
cancer_tib <- read.csv(file = url, sep = ",")

# Print a compact overview: one line per column with its type and first values.
glimpse(cancer_tib)
## Rows: 3,047
## Columns: 34
## $ avgAnnCount <dbl> 1397, 173, 102, 427, 57, 428, 250, 146, 88, 40~
## $ avgDeathsPerYear <int> 469, 70, 50, 202, 26, 152, 97, 71, 36, 1380, 3~
## $ TARGET_deathRate <dbl> 164.9, 161.3, 174.7, 194.8, 144.4, 176.0, 175.~
## $ incidenceRate <dbl> 489.8, 411.6, 349.7, 430.4, 350.1, 505.4, 461.~
## $ medIncome <int> 61898, 48127, 49348, 44243, 49955, 52313, 3778~
## $ popEst2015 <int> 260131, 43269, 21026, 75882, 10321, 61023, 415~
## $ povertyPercent <dbl> 11.2, 18.6, 14.6, 17.1, 12.5, 15.6, 23.2, 17.8~
## $ studyPerCap <dbl> 499.74820, 23.11123, 47.56016, 342.63725, 0.00~
## $ binnedInc <chr> "(61494.5, 125635]", "(48021.6, 51046.4]", "(4~
## $ MedianAge <dbl> 39.3, 33.0, 45.0, 42.8, 48.3, 45.4, 42.6, 51.7~
## $ MedianAgeMale <dbl> 36.9, 32.2, 44.0, 42.2, 47.8, 43.5, 42.2, 50.8~
## $ MedianAgeFemale <dbl> 41.7, 33.7, 45.8, 43.4, 48.9, 48.0, 43.5, 52.5~
## $ Geography <chr> "Kitsap County, Washington", "Kittitas County,~
## $ AvgHouseholdSize <dbl> 2.5400, 2.3400, 2.6200, 2.5200, 2.3400, 2.5800~
## $ PercentMarried <dbl> 52.5, 44.5, 54.2, 52.7, 57.8, 50.4, 54.1, 52.7~
## $ PctNoHS18_24 <dbl> 11.5, 6.1, 24.0, 20.2, 14.9, 29.9, 26.1, 27.3,~
## $ PctHS18_24 <dbl> 39.5, 22.4, 36.6, 41.2, 43.0, 35.1, 41.4, 33.9~
## $ PctSomeCol18_24 <dbl> 42.1, 64.0, NA, 36.1, 40.0, NA, NA, 36.5, NA, ~
## $ PctBachDeg18_24 <dbl> 6.9, 7.5, 9.5, 2.5, 2.0, 4.5, 5.8, 2.2, 1.4, 7~
## $ PctHS25_Over <dbl> 23.2, 26.0, 29.0, 31.6, 33.4, 30.4, 29.8, 31.6~
## $ PctBachDeg25_Over <dbl> 19.6, 22.7, 16.0, 9.3, 15.0, 11.9, 11.9, 11.3,~
## $ PctEmployed16_Over <dbl> 51.9, 55.9, 45.9, 48.3, 48.2, 44.1, 51.8, 40.9~
## $ PctUnemployed16_Over <dbl> 8.0, 7.8, 7.0, 12.1, 4.8, 12.9, 8.9, 8.9, 10.3~
## $ PctPrivateCoverage <dbl> 75.1, 70.2, 63.7, 58.4, 61.6, 60.0, 49.5, 55.8~
## $ PctPrivateCoverageAlone <dbl> NA, 53.8, 43.5, 40.3, 43.9, 38.8, 35.0, 33.1, ~
## $ PctEmpPrivCoverage <dbl> 41.6, 43.6, 34.9, 35.0, 35.1, 32.6, 28.3, 25.9~
## $ PctPublicCoverage <dbl> 32.9, 31.1, 42.1, 45.3, 44.0, 43.2, 46.4, 50.9~
## $ PctPublicCoverageAlone <dbl> 14.0, 15.3, 21.1, 25.0, 22.7, 20.2, 28.7, 24.1~
## $ PctWhite <dbl> 81.78053, 89.22851, 90.92219, 91.74469, 94.104~
## $ PctBlack <dbl> 2.5947283, 0.9691025, 0.7396734, 0.7826260, 0.~
## $ PctAsian <dbl> 4.82185710, 2.24623259, 0.46589818, 1.16135867~
## $ PctOtherRace <dbl> 1.84347853, 3.74135153, 2.74735831, 1.36264318~
## $ PctMarriedHouseholds <dbl> 52.85608, 45.37250, 54.44487, 51.02151, 54.027~
## $ BirthRate <dbl> 6.1188310, 4.3330956, 3.7294878, 4.6038408, 6.~
# Report the size of the data as c(number of rows, number of columns).
dim(cancer_tib)
## [1] 3047 34
# Render the first rows of the data (head() default) as a styled table.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
kbl(head(cancer_tib)) %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
avgAnnCount | avgDeathsPerYear | TARGET_deathRate | incidenceRate | medIncome | popEst2015 | povertyPercent | studyPerCap | binnedInc | MedianAge | MedianAgeMale | MedianAgeFemale | Geography | AvgHouseholdSize | PercentMarried | PctNoHS18_24 | PctHS18_24 | PctSomeCol18_24 | PctBachDeg18_24 | PctHS25_Over | PctBachDeg25_Over | PctEmployed16_Over | PctUnemployed16_Over | PctPrivateCoverage | PctPrivateCoverageAlone | PctEmpPrivCoverage | PctPublicCoverage | PctPublicCoverageAlone | PctWhite | PctBlack | PctAsian | PctOtherRace | PctMarriedHouseholds | BirthRate |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1397 | 469 | 164.9 | 489.8 | 61898 | 260131 | 11.2 | 499.74820 | (61494.5, 125635] | 39.3 | 36.9 | 41.7 | Kitsap County, Washington | 2.54 | 52.5 | 11.5 | 39.5 | 42.1 | 6.9 | 23.2 | 19.6 | 51.9 | 8.0 | 75.1 | NA | 41.6 | 32.9 | 14.0 | 81.78053 | 2.5947283 | 4.8218571 | 1.8434785 | 52.85608 | 6.118831 |
173 | 70 | 161.3 | 411.6 | 48127 | 43269 | 18.6 | 23.11123 | (48021.6, 51046.4] | 33.0 | 32.2 | 33.7 | Kittitas County, Washington | 2.34 | 44.5 | 6.1 | 22.4 | 64.0 | 7.5 | 26.0 | 22.7 | 55.9 | 7.8 | 70.2 | 53.8 | 43.6 | 31.1 | 15.3 | 89.22851 | 0.9691025 | 2.2462326 | 3.7413515 | 45.37250 | 4.333096 |
102 | 50 | 174.7 | 349.7 | 49348 | 21026 | 14.6 | 47.56016 | (48021.6, 51046.4] | 45.0 | 44.0 | 45.8 | Klickitat County, Washington | 2.62 | 54.2 | 24.0 | 36.6 | NA | 9.5 | 29.0 | 16.0 | 45.9 | 7.0 | 63.7 | 43.5 | 34.9 | 42.1 | 21.1 | 90.92219 | 0.7396734 | 0.4658982 | 2.7473583 | 54.44487 | 3.729488 |
427 | 202 | 194.8 | 430.4 | 44243 | 75882 | 17.1 | 342.63725 | (42724.4, 45201] | 42.8 | 42.2 | 43.4 | Lewis County, Washington | 2.52 | 52.7 | 20.2 | 41.2 | 36.1 | 2.5 | 31.6 | 9.3 | 48.3 | 12.1 | 58.4 | 40.3 | 35.0 | 45.3 | 25.0 | 91.74469 | 0.7826260 | 1.1613587 | 1.3626432 | 51.02151 | 4.603841 |
57 | 26 | 144.4 | 350.1 | 49955 | 10321 | 12.5 | 0.00000 | (48021.6, 51046.4] | 48.3 | 47.8 | 48.9 | Lincoln County, Washington | 2.34 | 57.8 | 14.9 | 43.0 | 40.0 | 2.0 | 33.4 | 15.0 | 48.2 | 4.8 | 61.6 | 43.9 | 35.1 | 44.0 | 22.7 | 94.10402 | 0.2701920 | 0.6658304 | 0.4921355 | 54.02746 | 6.796657 |
428 | 152 | 176.0 | 505.4 | 52313 | 61023 | 15.6 | 180.25990 | (51046.4, 54545.6] | 45.4 | 43.5 | 48.0 | Mason County, Washington | 2.58 | 50.4 | 29.9 | 35.1 | NA | 4.5 | 30.4 | 11.9 | 44.1 | 12.9 | 60.0 | 38.8 | 32.6 | 43.2 | 20.2 | 84.88263 | 1.6532052 | 1.5380566 | 3.3146354 | 51.22036 | 4.964476 |
Research question
- What are the predictors of death rate?
Cases
There are 3,047 observations and 34 variables in the dataset. Each observation represents a selected county in the United States of America.
Data collection
These data were aggregated from a number of sources, including the American Community Survey (census.gov), clinicaltrials.gov, and cancer.gov.
Type of study
This is an observational study.
Data Source
The data were sourced from https://data.world/nrippner/ols-regression-challenge
Response
TARGET_deathRate: The dependent variable. Mean per capita (per 100,000) cancer mortalities; it is numerical.
Explanatory
The explanatory variables are as follows:
avgAnnCount: Mean number of reported cases of cancer diagnosed annually
avgDeathsPerYear: Mean number of reported mortalities due to cancer
incidenceRate: Mean per capita (100,000) cancer diagnoses
medIncome: Median income per county
popEst2015: Population of county
povertyPercent: Percent of populace in poverty
studyPerCap: Per capita number of cancer-related clinical trials per county
binnedInc: Median income per capita binned by decile
MedianAge: Median age of county residents
MedianAgeMale: Median age of male county residents
MedianAgeFemale: Median age of female county residents
Geography: County name
AvgHouseholdSize: Mean household size of county
PercentMarried: Percent of county residents who are married
PctNoHS18_24: Percent of county residents ages 18-24 highest education attained: less than high school
PctHS18_24: Percent of county residents ages 18-24 highest education attained: high school diploma
PctSomeCol18_24: Percent of county residents ages 18-24 highest education attained: some college
PctBachDeg18_24: Percent of county residents ages 18-24 highest education attained: bachelor’s degree
PctHS25_Over: Percent of county residents ages 25 and over highest education attained: high school diploma
PctBachDeg25_Over: Percent of county residents ages 25 and over highest education attained: bachelor’s degree
PctEmployed16_Over: Percent of county residents ages 16 and over employed
PctUnemployed16_Over: Percent of county residents ages 16 and over unemployed
PctPrivateCoverage: Percent of county residents with private health coverage
PctPrivateCoverageAlone: Percent of county residents with private health coverage alone (no public assistance)
PctEmpPrivCoverage: Percent of county residents with employee-provided private health coverage
PctPublicCoverage: Percent of county residents with government-provided health coverage
PctPublicCoverageAlone: Percent of county residents with government-provided health coverage alone
PctWhite: Percent of county residents who identify as White
PctBlack: Percent of county residents who identify as Black
PctAsian: Percent of county residents who identify as Asian
PctOtherRace: Percent of county residents who identify in a category which is not White, Black, or Asian
PctMarriedHouseholds: Percent of married households
BirthRate: Number of live births relative to number of women in county
Relevant summary statistics
# Render the per-column summary() of the data as a styled table.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
kbl(summary(cancer_tib)) %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
avgAnnCount | avgDeathsPerYear | TARGET_deathRate | incidenceRate | medIncome | popEst2015 | povertyPercent | studyPerCap | binnedInc | MedianAge | MedianAgeMale | MedianAgeFemale | Geography | AvgHouseholdSize | PercentMarried | PctNoHS18_24 | PctHS18_24 | PctSomeCol18_24 | PctBachDeg18_24 | PctHS25_Over | PctBachDeg25_Over | PctEmployed16_Over | PctUnemployed16_Over | PctPrivateCoverage | PctPrivateCoverageAlone | PctEmpPrivCoverage | PctPublicCoverage | PctPublicCoverageAlone | PctWhite | PctBlack | PctAsian | PctOtherRace | PctMarriedHouseholds | BirthRate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min. : 6.0 | Min. : 3 | Min. : 59.7 | Min. : 201.3 | Min. : 22640 | Min. : 827 | Min. : 3.20 | Min. : 0.00 | Length:3047 | Min. : 22.30 | Min. :22.40 | Min. :22.30 | Length:3047 | Min. :0.0221 | Min. :23.10 | Min. : 0.00 | Min. : 0.0 | Min. : 7.10 | Min. : 0.000 | Min. : 7.50 | Min. : 2.50 | Min. :17.60 | Min. : 0.400 | Min. :22.30 | Min. :15.70 | Min. :13.5 | Min. :11.20 | Min. : 2.60 | Min. : 10.20 | Min. : 0.0000 | Min. : 0.0000 | Min. : 0.0000 | Min. :22.99 | Min. : 0.000 | |
1st Qu.: 76.0 | 1st Qu.: 28 | 1st Qu.:161.2 | 1st Qu.: 420.3 | 1st Qu.: 38883 | 1st Qu.: 11684 | 1st Qu.:12.15 | 1st Qu.: 0.00 | Class :character | 1st Qu.: 37.70 | 1st Qu.:36.35 | 1st Qu.:39.10 | Class :character | 1st Qu.:2.3700 | 1st Qu.:47.75 | 1st Qu.:12.80 | 1st Qu.:29.2 | 1st Qu.:34.00 | 1st Qu.: 3.100 | 1st Qu.:30.40 | 1st Qu.: 9.40 | 1st Qu.:48.60 | 1st Qu.: 5.500 | 1st Qu.:57.20 | 1st Qu.:41.00 | 1st Qu.:34.5 | 1st Qu.:30.90 | 1st Qu.:14.85 | 1st Qu.: 77.30 | 1st Qu.: 0.6207 | 1st Qu.: 0.2542 | 1st Qu.: 0.2952 | 1st Qu.:47.76 | 1st Qu.: 4.521 | |
Median : 171.0 | Median : 61 | Median :178.1 | Median : 453.5 | Median : 45207 | Median : 26643 | Median :15.90 | Median : 0.00 | Mode :character | Median : 41.00 | Median :39.60 | Median :42.40 | Mode :character | Median :2.5000 | Median :52.40 | Median :17.10 | Median :34.7 | Median :40.40 | Median : 5.400 | Median :35.30 | Median :12.30 | Median :54.50 | Median : 7.600 | Median :65.10 | Median :48.70 | Median :41.1 | Median :36.30 | Median :18.80 | Median : 90.06 | Median : 2.2476 | Median : 0.5498 | Median : 0.8262 | Median :51.67 | Median : 5.381 | |
Mean : 606.3 | Mean : 186 | Mean :178.7 | Mean : 448.3 | Mean : 47063 | Mean : 102637 | Mean :16.88 | Mean : 155.40 | NA | Mean : 45.27 | Mean :39.57 | Mean :42.15 | NA | Mean :2.4797 | Mean :51.77 | Mean :18.22 | Mean :35.0 | Mean :40.98 | Mean : 6.158 | Mean :34.80 | Mean :13.28 | Mean :54.15 | Mean : 7.852 | Mean :64.35 | Mean :48.45 | Mean :41.2 | Mean :36.25 | Mean :19.24 | Mean : 83.65 | Mean : 9.1080 | Mean : 1.2540 | Mean : 1.9835 | Mean :51.24 | Mean : 5.640 | |
3rd Qu.: 518.0 | 3rd Qu.: 149 | 3rd Qu.:195.2 | 3rd Qu.: 480.9 | 3rd Qu.: 52492 | 3rd Qu.: 68671 | 3rd Qu.:20.40 | 3rd Qu.: 83.65 | NA | 3rd Qu.: 44.00 | 3rd Qu.:42.50 | 3rd Qu.:45.30 | NA | 3rd Qu.:2.6300 | 3rd Qu.:56.40 | 3rd Qu.:22.70 | 3rd Qu.:40.7 | 3rd Qu.:46.40 | 3rd Qu.: 8.200 | 3rd Qu.:39.65 | 3rd Qu.:16.10 | 3rd Qu.:60.30 | 3rd Qu.: 9.700 | 3rd Qu.:72.10 | 3rd Qu.:55.60 | 3rd Qu.:47.7 | 3rd Qu.:41.55 | 3rd Qu.:23.10 | 3rd Qu.: 95.45 | 3rd Qu.:10.5097 | 3rd Qu.: 1.2210 | 3rd Qu.: 2.1780 | 3rd Qu.:55.40 | 3rd Qu.: 6.494 | |
Max. :38150.0 | Max. :14010 | Max. :362.8 | Max. :1206.9 | Max. :125635 | Max. :10170292 | Max. :47.40 | Max. :9762.31 | NA | Max. :624.00 | Max. :64.70 | Max. :65.70 | NA | Max. :3.9700 | Max. :72.50 | Max. :64.10 | Max. :72.5 | Max. :79.00 | Max. :51.800 | Max. :54.80 | Max. :42.20 | Max. :80.10 | Max. :29.400 | Max. :92.30 | Max. :78.90 | Max. :70.7 | Max. :65.10 | Max. :46.60 | Max. :100.00 | Max. :85.9478 | Max. :42.6194 | Max. :41.9303 | Max. :78.08 | Max. :21.326 | |
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :2285 | NA | NA | NA | NA’s :152 | NA | NA | NA’s :609 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
# Scatter plot of the response (TARGET_deathRate) against incidenceRate.
cancer_tib %>%
  ggplot(aes(incidenceRate, TARGET_deathRate)) +
  geom_point() +
  labs(title = "Target Death Rate vs Incident Rate")
# Histogram of the response variable TARGET_deathRate.
# No bins/binwidth is given, so ggplot2 uses its default binning and emits the
# stat_bin() message about picking a better value.
cancer_tib %>%
  ggplot(aes(TARGET_deathRate)) +
  geom_histogram() +
  labs(title = "Distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plot of the response (TARGET_deathRate) against medIncome.
cancer_tib %>%
  ggplot(aes(medIncome, TARGET_deathRate)) +
  geom_point() +
  labs(title = "Target Death Rate vs Median Income")
# Box plot of TARGET_deathRate, used to inspect outlying values.
cancer_tib %>%
  ggplot(aes(TARGET_deathRate)) +
  geom_boxplot() +
  labs(title = "View outliers in Target Death Rate")