Data Preparation

library(tidyverse)
library(kableExtra)
# Location of the cancer_reg.csv dataset (raw file hosted on GitHub).
url <- "https://raw.githubusercontent.com/nnaemeka-git/global-datasets/main/cancer_reg.csv"
# read.csv() already defaults to comma-separated input, so `sep` is omitted.
cancer_tib <- read.csv(file = url)
# Print one line per column (name, type, leading values) to check the import.
glimpse(cancer_tib)
## Rows: 3,047
## Columns: 34
## $ avgAnnCount             <dbl> 1397, 173, 102, 427, 57, 428, 250, 146, 88, 40~
## $ avgDeathsPerYear        <int> 469, 70, 50, 202, 26, 152, 97, 71, 36, 1380, 3~
## $ TARGET_deathRate        <dbl> 164.9, 161.3, 174.7, 194.8, 144.4, 176.0, 175.~
## $ incidenceRate           <dbl> 489.8, 411.6, 349.7, 430.4, 350.1, 505.4, 461.~
## $ medIncome               <int> 61898, 48127, 49348, 44243, 49955, 52313, 3778~
## $ popEst2015              <int> 260131, 43269, 21026, 75882, 10321, 61023, 415~
## $ povertyPercent          <dbl> 11.2, 18.6, 14.6, 17.1, 12.5, 15.6, 23.2, 17.8~
## $ studyPerCap             <dbl> 499.74820, 23.11123, 47.56016, 342.63725, 0.00~
## $ binnedInc               <chr> "(61494.5, 125635]", "(48021.6, 51046.4]", "(4~
## $ MedianAge               <dbl> 39.3, 33.0, 45.0, 42.8, 48.3, 45.4, 42.6, 51.7~
## $ MedianAgeMale           <dbl> 36.9, 32.2, 44.0, 42.2, 47.8, 43.5, 42.2, 50.8~
## $ MedianAgeFemale         <dbl> 41.7, 33.7, 45.8, 43.4, 48.9, 48.0, 43.5, 52.5~
## $ Geography               <chr> "Kitsap County, Washington", "Kittitas County,~
## $ AvgHouseholdSize        <dbl> 2.5400, 2.3400, 2.6200, 2.5200, 2.3400, 2.5800~
## $ PercentMarried          <dbl> 52.5, 44.5, 54.2, 52.7, 57.8, 50.4, 54.1, 52.7~
## $ PctNoHS18_24            <dbl> 11.5, 6.1, 24.0, 20.2, 14.9, 29.9, 26.1, 27.3,~
## $ PctHS18_24              <dbl> 39.5, 22.4, 36.6, 41.2, 43.0, 35.1, 41.4, 33.9~
## $ PctSomeCol18_24         <dbl> 42.1, 64.0, NA, 36.1, 40.0, NA, NA, 36.5, NA, ~
## $ PctBachDeg18_24         <dbl> 6.9, 7.5, 9.5, 2.5, 2.0, 4.5, 5.8, 2.2, 1.4, 7~
## $ PctHS25_Over            <dbl> 23.2, 26.0, 29.0, 31.6, 33.4, 30.4, 29.8, 31.6~
## $ PctBachDeg25_Over       <dbl> 19.6, 22.7, 16.0, 9.3, 15.0, 11.9, 11.9, 11.3,~
## $ PctEmployed16_Over      <dbl> 51.9, 55.9, 45.9, 48.3, 48.2, 44.1, 51.8, 40.9~
## $ PctUnemployed16_Over    <dbl> 8.0, 7.8, 7.0, 12.1, 4.8, 12.9, 8.9, 8.9, 10.3~
## $ PctPrivateCoverage      <dbl> 75.1, 70.2, 63.7, 58.4, 61.6, 60.0, 49.5, 55.8~
## $ PctPrivateCoverageAlone <dbl> NA, 53.8, 43.5, 40.3, 43.9, 38.8, 35.0, 33.1, ~
## $ PctEmpPrivCoverage      <dbl> 41.6, 43.6, 34.9, 35.0, 35.1, 32.6, 28.3, 25.9~
## $ PctPublicCoverage       <dbl> 32.9, 31.1, 42.1, 45.3, 44.0, 43.2, 46.4, 50.9~
## $ PctPublicCoverageAlone  <dbl> 14.0, 15.3, 21.1, 25.0, 22.7, 20.2, 28.7, 24.1~
## $ PctWhite                <dbl> 81.78053, 89.22851, 90.92219, 91.74469, 94.104~
## $ PctBlack                <dbl> 2.5947283, 0.9691025, 0.7396734, 0.7826260, 0.~
## $ PctAsian                <dbl> 4.82185710, 2.24623259, 0.46589818, 1.16135867~
## $ PctOtherRace            <dbl> 1.84347853, 3.74135153, 2.74735831, 1.36264318~
## $ PctMarriedHouseholds    <dbl> 52.85608, 45.37250, 54.44487, 51.02151, 54.027~
## $ BirthRate               <dbl> 6.1188310, 4.3330956, 3.7294878, 4.6038408, 6.~
# Number of rows followed by number of columns in the imported data.
c(nrow(cancer_tib), ncol(cancer_tib))
## [1] 3047   34
# Render the first rows of the data as a styled table with row hover
# highlighting. `FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
cancer_tib %>%
  head() %>%
  kbl() %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap binnedInc MedianAge MedianAgeMale MedianAgeFemale Geography AvgHouseholdSize PercentMarried PctNoHS18_24 PctHS18_24 PctSomeCol18_24 PctBachDeg18_24 PctHS25_Over PctBachDeg25_Over PctEmployed16_Over PctUnemployed16_Over PctPrivateCoverage PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage PctPublicCoverageAlone PctWhite PctBlack PctAsian PctOtherRace PctMarriedHouseholds BirthRate
1397 469 164.9 489.8 61898 260131 11.2 499.74820 (61494.5, 125635] 39.3 36.9 41.7 Kitsap County, Washington 2.54 52.5 11.5 39.5 42.1 6.9 23.2 19.6 51.9 8.0 75.1 NA 41.6 32.9 14.0 81.78053 2.5947283 4.8218571 1.8434785 52.85608 6.118831
173 70 161.3 411.6 48127 43269 18.6 23.11123 (48021.6, 51046.4] 33.0 32.2 33.7 Kittitas County, Washington 2.34 44.5 6.1 22.4 64.0 7.5 26.0 22.7 55.9 7.8 70.2 53.8 43.6 31.1 15.3 89.22851 0.9691025 2.2462326 3.7413515 45.37250 4.333096
102 50 174.7 349.7 49348 21026 14.6 47.56016 (48021.6, 51046.4] 45.0 44.0 45.8 Klickitat County, Washington 2.62 54.2 24.0 36.6 NA 9.5 29.0 16.0 45.9 7.0 63.7 43.5 34.9 42.1 21.1 90.92219 0.7396734 0.4658982 2.7473583 54.44487 3.729488
427 202 194.8 430.4 44243 75882 17.1 342.63725 (42724.4, 45201] 42.8 42.2 43.4 Lewis County, Washington 2.52 52.7 20.2 41.2 36.1 2.5 31.6 9.3 48.3 12.1 58.4 40.3 35.0 45.3 25.0 91.74469 0.7826260 1.1613587 1.3626432 51.02151 4.603841
57 26 144.4 350.1 49955 10321 12.5 0.00000 (48021.6, 51046.4] 48.3 47.8 48.9 Lincoln County, Washington 2.34 57.8 14.9 43.0 40.0 2.0 33.4 15.0 48.2 4.8 61.6 43.9 35.1 44.0 22.7 94.10402 0.2701920 0.6658304 0.4921355 54.02746 6.796657
428 152 176.0 505.4 52313 61023 15.6 180.25990 (51046.4, 54545.6] 45.4 43.5 48.0 Mason County, Washington 2.58 50.4 29.9 35.1 NA 4.5 30.4 11.9 44.1 12.9 60.0 38.8 32.6 43.2 20.2 84.88263 1.6532052 1.5380566 3.3146354 51.22036 4.964476

Research question

Cases

The dataset contains 3,047 observations and 34 variables. Each observation represents a selected county in the United States of America.

Data collection

These data were aggregated from a number of sources, including the American Community Survey (census.gov), clinicaltrials.gov, and cancer.gov.

Type of study

This is an observational study.

Data Source

The data were sourced from https://data.world/nrippner/ols-regression-challenge

Response

TARGET_deathRate: the dependent variable. It is the mean number of cancer mortalities per capita (per 100,000 people) and is numerical.

Explanatory

The explanatory variables are as follows:

Relevant summary statistics

# Render the per-column summary statistics as a styled table with row hover
# highlighting. `FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
cancer_tib %>%
  summary() %>%
  kbl() %>%
  kable_styling() %>%
  kable_paper("hover", full_width = FALSE)
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap binnedInc MedianAge MedianAgeMale MedianAgeFemale Geography AvgHouseholdSize PercentMarried PctNoHS18_24 PctHS18_24 PctSomeCol18_24 PctBachDeg18_24 PctHS25_Over PctBachDeg25_Over PctEmployed16_Over PctUnemployed16_Over PctPrivateCoverage PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage PctPublicCoverageAlone PctWhite PctBlack PctAsian PctOtherRace PctMarriedHouseholds BirthRate
Min. : 6.0 Min. : 3 Min. : 59.7 Min. : 201.3 Min. : 22640 Min. : 827 Min. : 3.20 Min. : 0.00 Length:3047 Min. : 22.30 Min. :22.40 Min. :22.30 Length:3047 Min. :0.0221 Min. :23.10 Min. : 0.00 Min. : 0.0 Min. : 7.10 Min. : 0.000 Min. : 7.50 Min. : 2.50 Min. :17.60 Min. : 0.400 Min. :22.30 Min. :15.70 Min. :13.5 Min. :11.20 Min. : 2.60 Min. : 10.20 Min. : 0.0000 Min. : 0.0000 Min. : 0.0000 Min. :22.99 Min. : 0.000
1st Qu.: 76.0 1st Qu.: 28 1st Qu.:161.2 1st Qu.: 420.3 1st Qu.: 38883 1st Qu.: 11684 1st Qu.:12.15 1st Qu.: 0.00 Class :character 1st Qu.: 37.70 1st Qu.:36.35 1st Qu.:39.10 Class :character 1st Qu.:2.3700 1st Qu.:47.75 1st Qu.:12.80 1st Qu.:29.2 1st Qu.:34.00 1st Qu.: 3.100 1st Qu.:30.40 1st Qu.: 9.40 1st Qu.:48.60 1st Qu.: 5.500 1st Qu.:57.20 1st Qu.:41.00 1st Qu.:34.5 1st Qu.:30.90 1st Qu.:14.85 1st Qu.: 77.30 1st Qu.: 0.6207 1st Qu.: 0.2542 1st Qu.: 0.2952 1st Qu.:47.76 1st Qu.: 4.521
Median : 171.0 Median : 61 Median :178.1 Median : 453.5 Median : 45207 Median : 26643 Median :15.90 Median : 0.00 Mode :character Median : 41.00 Median :39.60 Median :42.40 Mode :character Median :2.5000 Median :52.40 Median :17.10 Median :34.7 Median :40.40 Median : 5.400 Median :35.30 Median :12.30 Median :54.50 Median : 7.600 Median :65.10 Median :48.70 Median :41.1 Median :36.30 Median :18.80 Median : 90.06 Median : 2.2476 Median : 0.5498 Median : 0.8262 Median :51.67 Median : 5.381
Mean : 606.3 Mean : 186 Mean :178.7 Mean : 448.3 Mean : 47063 Mean : 102637 Mean :16.88 Mean : 155.40 NA Mean : 45.27 Mean :39.57 Mean :42.15 NA Mean :2.4797 Mean :51.77 Mean :18.22 Mean :35.0 Mean :40.98 Mean : 6.158 Mean :34.80 Mean :13.28 Mean :54.15 Mean : 7.852 Mean :64.35 Mean :48.45 Mean :41.2 Mean :36.25 Mean :19.24 Mean : 83.65 Mean : 9.1080 Mean : 1.2540 Mean : 1.9835 Mean :51.24 Mean : 5.640
3rd Qu.: 518.0 3rd Qu.: 149 3rd Qu.:195.2 3rd Qu.: 480.9 3rd Qu.: 52492 3rd Qu.: 68671 3rd Qu.:20.40 3rd Qu.: 83.65 NA 3rd Qu.: 44.00 3rd Qu.:42.50 3rd Qu.:45.30 NA 3rd Qu.:2.6300 3rd Qu.:56.40 3rd Qu.:22.70 3rd Qu.:40.7 3rd Qu.:46.40 3rd Qu.: 8.200 3rd Qu.:39.65 3rd Qu.:16.10 3rd Qu.:60.30 3rd Qu.: 9.700 3rd Qu.:72.10 3rd Qu.:55.60 3rd Qu.:47.7 3rd Qu.:41.55 3rd Qu.:23.10 3rd Qu.: 95.45 3rd Qu.:10.5097 3rd Qu.: 1.2210 3rd Qu.: 2.1780 3rd Qu.:55.40 3rd Qu.: 6.494
Max. :38150.0 Max. :14010 Max. :362.8 Max. :1206.9 Max. :125635 Max. :10170292 Max. :47.40 Max. :9762.31 NA Max. :624.00 Max. :64.70 Max. :65.70 NA Max. :3.9700 Max. :72.50 Max. :64.10 Max. :72.5 Max. :79.00 Max. :51.800 Max. :54.80 Max. :42.20 Max. :80.10 Max. :29.400 Max. :92.30 Max. :78.90 Max. :70.7 Max. :65.10 Max. :46.60 Max. :100.00 Max. :85.9478 Max. :42.6194 Max. :41.9303 Max. :78.08 Max. :21.326
NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA’s :2285 NA NA NA NA’s :152 NA NA NA’s :609 NA NA NA NA NA NA NA NA NA
# Scatter plot of the response (TARGET_deathRate, y) against incidenceRate (x).
# The title names the plotted variable: "Incidence Rate", not "Incident Rate".
cancer_tib %>%
  ggplot(aes(x = incidenceRate, y = TARGET_deathRate)) +
  geom_point() +
  labs(title = "Target Death Rate vs Incidence Rate")

# Histogram of the response variable. `bins` is set explicitly (30 is the
# value stat_bin() otherwise falls back to), so the figure is unchanged and
# the "Pick better value with `binwidth`" message is no longer emitted.
cancer_tib %>%
  ggplot(aes(x = TARGET_deathRate)) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution")

# Scatter plot with medIncome on the x axis and TARGET_deathRate on the y axis.
ggplot(
  data = cancer_tib,
  mapping = aes(x = medIncome, y = TARGET_deathRate)
) +
  geom_point() +
  labs(title = "Target Death Rate vs Median Income")

# Single boxplot of TARGET_deathRate mapped to the x axis, used to look for
# outlying values.
ggplot(data = cancer_tib, mapping = aes(x = TARGET_deathRate)) +
  geom_boxplot() +
  labs(title = "View outliers in Target Death Rate")