So we’re having population and death data of lung cancer from each age group by ROC era per observation year and district.
library(dplyr);library(tidyr);library(readxl)
data <- readxl::read_excel("C:\\Users\\gilli\\Downloads\\m155.xls")
head(data)
## # A tibble: 6 × 5
## agegp pop year address death
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Y00_04 109317 61 1 1
## 2 Y05_09 116574 61 1 NA
## 3 Y10_14 114398 61 1 NA
## 4 Y15_19 102715 61 1 NA
## 5 Y20_24 85540 61 1 2
## 6 Y25_29 63738 61 1 1
Then we replace the NA death with 0, and group every 5 observation
year as a new column called yeargp.
data[is.na(data)] <- 0
data$age <-as.numeric( substr(data$agegp,start = 2,stop =3))
data$yeargp[data$year <= 65] <- 'æ°‘61-65å¹´'
data$yeargp[data$year > 65 & data$year <= 70] <- 'æ°‘66-70å¹´'
data$yeargp[data$year > 70 & data$year <= 75] <- 'æ°‘71-75å¹´'
data$yeargp[data$year > 75 & data$year <= 80] <- 'æ°‘76-80å¹´'
data$yeargp[data$year > 80 & data$year <= 85] <- 'æ°‘81-85å¹´'
data$yeargp[data$year > 85 & data$year <= 90] <- 'æ°‘86-90å¹´'
Since we’re comparing the mortality from different age group, let’s break the data into the elders and the youngs.
elder <- data %>% filter(age>=50)%>% arrange(yeargp, agegp)
young <- data %>% filter(age<= 14)%>% arrange(yeargp, agegp)
We’ll start to calculate the mortality of every 5 year, so we have to sum the death and population during the time interval over different address. After that we can calculate the mortality rate.
elder_cu_age_specific <- elder %>%
group_by(yeargp, agegp) %>%
summarise(sum_death = sum(death),
sum_pop = sum(pop)) %>%
mutate(rate = sum_death/sum_pop, rate10 = rate*100000)
young_cu_age_specific <- young %>%
group_by(yeargp, agegp) %>%
summarise(sum_death = sum(death),
sum_pop = sum(pop)) %>%
mutate(rate = sum_death/sum_pop, rate10 = rate*100000)
elder_long <- elder_cu_age_specific %>%
mutate(yeargp = factor(yeargp), agegp = factor(agegp)) %>%
select(agegp, yeargp, rate10)
elder_apc <- pivot_wider(data = elder_long,names_from = yeargp, values_from = rate10)
young_long <- young_cu_age_specific %>%
mutate(yeargp = factor(yeargp), agegp = factor(agegp)) %>%
select(agegp, yeargp, rate10)
young_apc <- pivot_wider(data = young_long,names_from = yeargp, values_from = rate10)
So let’s see the aggregated data.
print(elder_apc)
## # A tibble: 8 × 7
## agegp `æ°‘61-65å¹´` `æ°‘66-70å¹´` `æ°‘71-75å¹´` `æ°‘76-80å¹´` `æ°‘81-85å¹´` `æ°‘86-90å¹´`
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Y50_54 60.7 68.4 72.7 75.4 83.9 81.5
## 2 Y55_59 84.2 85.8 97.6 100. 123. 121.
## 3 Y60_64 98.6 106. 117. 120. 153. 164.
## 4 Y65_69 118. 132. 142. 137. 170. 187.
## 5 Y70_74 126. 130. 154. 172. 204. 199.
## 6 Y75_79 122. 155. 184. 189. 238. 239.
## 7 Y80_84 137. 147. 161. 171. 252. 261.
## 8 Y85ABO 104. 135. 236. 210. 246. 254.
print(young_apc)
## # A tibble: 3 × 7
## agegp `æ°‘61-65å¹´` `æ°‘66-70å¹´` `æ°‘71-75å¹´` `æ°‘76-80å¹´` `æ°‘81-85å¹´` `æ°‘86-90å¹´`
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Y00_04 0.578 0.504 0.623 0.517 0.359 0.201
## 2 Y05_09 0.621 0.904 1.05 0.756 0.280 0.142
## 3 Y10_14 0.721 1.04 1.12 0.800 0.659 0.352
You can stop here and do it on hand for drawing apc in excel.
write.csv(elder_apc, "C:\\Users\\gilli\\Downloads\\New folder\\elder_apc.csv" )
write.csv(young_apc, "C:\\Users\\gilli\\Downloads\\New folder\\young_apc.csv")
library(ggplot2)
par(mfrow = c(1,2))
ggplot(data = elder_long, aes(x = yeargp, y = rate10, col = agegp, group = agegp)) +
geom_point() + geom_line() + theme_bw()
ggplot(data = young_long, aes(x = yeargp, y = rate10, col = agegp, group = agegp)) +
geom_point() + geom_line() + theme_bw()
par(mfrow = c(1,2))
ggplot(data = elder_long, aes(x = agegp, y = rate10, col = yeargp, group = yeargp)) +
geom_point() + geom_line() + theme_bw()
ggplot(data = young_long, aes(x = agegp, y = rate10, col = yeargp, group = yeargp)) +
geom_point() + geom_line() + theme_bw()
First, let’s look at the group of people born earlier. Each line represents a cohort. We can see that the overall trend is moving from the bottom left to the top right, indicating that as these cohorts ages, the mortality rate increases. As we zoom in, we can see that the curves are shifting slightly vertically. This suggests that in these decades, there is a trend of increasing mortality rates
elder_apc %>%
ungroup() %>%
mutate(row = row_number()) %>%
gather(year, mortality, `æ°‘61-65å¹´`:`æ°‘86-90å¹´`) %>%
arrange(row, desc(year)) %>%
group_by(row) %>%
mutate(time_left = row_number()) %>%
transmute(agegp, year, mortality, start_year = time_left + row - 1)%>%
ggplot(aes(agegp, mortality, color = factor(start_year), group = factor(start_year))) +
geom_line() + ggtitle("cohort effect of the >50 population") + theme_bw()
Now let’s look at the younger generation, those who have been vaccinated. First, let’s look at the y-axis. The mortality rate per hundred thousand population is much lower compared to the previous axis. This is because they are younger. We cannot see the overall trend, however, for the slope of each cohort, the trend shifts from positive to negative. This means that the mortality rate decreases with each generation.
young_apc %>%
ungroup() %>%
mutate(row = row_number()) %>%
gather(year, mortality, `æ°‘61-65å¹´`:`æ°‘86-90å¹´`) %>%
arrange(row, desc(year)) %>%
group_by(row) %>%
mutate(time_left = row_number()) %>%
transmute(agegp, year, mortality, start_year = time_left + row - 1)%>%
ggplot(aes(agegp, mortality, color = factor(start_year), group = factor(start_year))) +
geom_line() + ggtitle("cohort effect of the <14 population") + theme_bw()
We can attribute the slope change on mortality on the intervention of
HBV vaccine in Taiwan, since we’re only observing the cohort with lower
age, which means of less exposure to develop kung cancer, and
experiencing the HBV vaccine policy.