library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library("scales")
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(ggsci)
library(treemap)
library("ggplot2")
This dataset, originally in xlsx format, was converted to CSV and then read directly into R.
# Read the cancer cost data from an explicit path instead of calling setwd(),
# so loading the file does not change the session's working directory.
# path.expand() resolves the leading "~" to the user's home directory.
cancer <- read_csv(file.path(path.expand("~/Data110"), "Cancerdataset.csv"))
## Rows: 1254 Columns: 10
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Cancer Site, Sex, Age, Incidence and Survival Assumptions, Annual C...
## dbl (5): Year, Total Costs, Initial Year After Diagnosis Cost, Continuing Ph...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Normalise the column names in one pass: lower-case them, then strip every
# space, so columns can be referenced without backticks
# (e.g. `Cancer Site` becomes cancersite).
names(cancer) <- gsub(" ", "", tolower(names(cancer)), fixed = TRUE)

# Print the structure to confirm the renamed columns and their types.
str(cancer)
## spec_tbl_df [1,254 x 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ cancersite : chr [1:1254] "AllSites" "AllSites" "AllSites" "AllSites" ...
## $ year : num [1:1254] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ sex : chr [1:1254] "Both sexes" "Both sexes" "Both sexes" "Both sexes" ...
## $ age : chr [1:1254] "All ages" "All ages" "All ages" "All ages" ...
## $ incidenceandsurvivalassumptions: chr [1:1254] "Incidence, Survival at constant rate" "Incidence follows recent trend, Survival constant" "Survival follows recent trend, Incidence constant" "Incidence, Survival follow recent trends" ...
## $ annualcostincrease : chr [1:1254] "0%" "0%" "0%" "0%" ...
## $ totalcosts : num [1:1254] 124566 122421 125398 123236 123236 ...
## $ initialyearafterdiagnosiscost : num [1:1254] 40464 38553 40464 38553 38553 ...
## $ continuingphasecost : num [1:1254] 46643 46672 47136 47156 47156 ...
## $ lastyearoflifecost : num [1:1254] 37459 37196 37798 37528 37528 ...
## - attr(*, "spec")=
## .. cols(
## .. `Cancer Site` = col_character(),
## .. Year = col_double(),
## .. Sex = col_character(),
## .. Age = col_character(),
## .. `Incidence and Survival Assumptions` = col_character(),
## .. `Annual Cost Increase` = col_character(),
## .. `Total Costs` = col_double(),
## .. `Initial Year After Diagnosis Cost` = col_double(),
## .. `Continuing Phase Cost` = col_double(),
## .. `Last Year of Life Cost` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Keep only the site-specific rows by dropping the "AllSites" records
# (presumably an aggregate over every site -- verify against the data source).
# Aggregation by site and year happens in the steps that follow.
graph10 <- filter(cancer, cancersite != "AllSites")
# Total cost per cancer site and year: sums totalcosts over every row that
# falls in each (cancersite, year) group.
# NOTE(review): each group mixes rows for different sex / age / assumption /
# annual-cost-increase values, so the sum adds those rows together -- confirm
# that this is the intended definition of the yearly total.
# .groups = "drop" returns an ungrouped tibble, so no grouping leaks into later
# steps and summarise() no longer prints its grouped-output message.
graph11 <- graph10 %>%
  group_by(cancersite, year) %>%
  summarise(mytot = sum(totalcosts), .groups = "drop")
# Average the yearly totals across years, giving one row (mymean) per site.
graph12 <- summarise(
  group_by(graph11, cancersite),
  mymean = mean(mytot)
)
# Horizontal bar chart of the mean yearly total cost per cancer site, with the
# sites ordered by that mean and each site given a fixed fill colour.
graph12 %>%
  ggplot(aes(x = reorder(cancersite, mymean), y = mymean, fill = cancersite)) +
  # geom_col() uses the y value as the bar height; it is the direct
  # replacement for geom_bar(stat = "identity").
  geom_col() +
  coord_flip() +
  theme_bw() +
  scale_fill_manual(values = c(
    "Other" = "#a61c00", "Breast" = "#d9d9d9",
    "Colorectal" = "#ae9999", "Lymphoma" = "#4a86e8",
    "Prostate" = "#ff9900", "Lung" = "#6aa84f",
    "Leukemia" = "#c27ba0", "Brain" = "#38761d",
    "Ovary" = "#000000", "Kidney" = "#8e7cc3",
    "Bladder" = "#76a5af", "Head_Neck" = "#fff2cc",
    "Melanoma" = "#ff0000", "Uterus" = "#ffce30",
    "Pancreas" = "#d9aed3", "Stomach" = "#ff00ff",
    "Esophagus" = "#7f6000", "Cervix" = "#666666"
  )) +
  # The cost is the y aesthetic even though coord_flip() draws it horizontally,
  # so thousands separators would need scale_y_continuous(labels = comma),
  # not a continuous x scale (x is the discrete site axis).
  # theme() must come after theme_bw(), which would otherwise reset these
  # settings. The valid value is lower-case "none"; "None" is not an accepted
  # legend position.
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(x = "Cancer Site", y = "Average Cost", title = "Average cost of Cancer")
# Treemap of the same per-site averages: mymean drives both the tile area
# (vSize) and the tile colour (vColor, mapped through the RdYlBu palette).
treemap(
  graph12,
  index = "cancersite",
  vSize = "mymean",
  vColor = "mymean",
  type = "value",
  palette = "RdYlBu",
  title = "Treemap by Cancer Site",
  title.legend = "Average Cost of Cancer"
)
# Mean total cost per year, computed from the "AllSites" rows only.
# cancersite stays in group_by() so the column is kept in the result.
# .groups = "drop" returns an ungrouped tibble and stops summarise() from
# printing its grouped-output message.
graph13 <- cancer %>%
  filter(cancersite == "AllSites") %>%
  group_by(cancersite, year) %>%
  summarise(mymean3 = mean(totalcosts), .groups = "drop")
Let’s see how the cost of cancer treatment has changed over the past 10 years.
# Line chart of the yearly mean total cost for the "AllSites" rows.
graph13 %>%
  # Point colour is mapped to the negated cost, which reverses the direction
  # of the default continuous colour gradient.
  ggplot(aes(year, mymean3, colour = -mymean3)) +
  geom_point(size = 5) +
  # A fixed colour here overrides the mapped colour for the line only.
  geom_line(color = "blue") +
  scale_x_continuous(limits = c(2010, 2021)) +
  theme_bw() +
  # Must come after theme_bw(), which would otherwise reset the setting.
  # The valid value is lower-case "none"; "None" is not an accepted
  # legend position.
  theme(legend.position = "none") +
  labs(
    x = "Year", y = "Average Cost of cancer",
    title = "Average cost of Cancer - 2010 - 2020"
  )
The line graph illustrates the average cost of cancer treatment in the US from 2010 to 2020.
In this dataset, downloaded from data.world (https://data.world/abunday/cancer-data), cost is divided into three categories: initial year after diagnosis, continuing phase, and last year of life. The line graph shows, for each year, the average of the total cost, which is the sum of those three components. There are six other variables: “Cancer site” (i.e., the location of the cancer within the body), year, sex, age, “incidence and survival assumptions”, and “annual cost increase”. Looking at the trend, we can see that treatment cost is on the rise.
Notice that in 2010 the overall average cost of treatment was below 125k, compared to over 160k in 2020. Given that the average US savings account balance is less than $6,000.00, patients seeking treatment could easily spend their life savings and accumulate serious debt in the process.
source: www.valuepenguin.com/banking/average-savings-account-balance