This is an analysis of data from the Huckleberry App, an app used by
parents to record information about their baby including sleep, feeding,
and growth.
Set Up
Overview of Data
raw <- read_csv("ronan.csv") %>%
dplyr::select(Type, Start, End, `Start Condition`, `Start Location`, `End Condition`, Notes) %>%
rename(
Start_Condition = `Start Condition`,
Start_Location = `Start Location`,
End_Condition = `End Condition`) %>%
mutate(
Start = strptime(Start, format = "%d/%m/%y %H:%M"),
End = strptime(End, format = "%d/%m/%y %H:%M"),
age_days = as.numeric(difftime(as.Date(Start), as.Date("2023-03-02"), units = "days")),
age_weeks = floor(age_days/7)
)
skim(raw)
| Name | raw |
| Number of rows | 3356 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Type | 0 | 1.00 | 4 | 6 | 0 | 7 | 0 |
| Start | 0 | 1.00 | 10089 | 26902 | 0 | 3294 | 0 |
| End | 1842 | 0.45 | 11934 | 19519 | 0 | 1513 | 0 |
| Start_Condition | 1596 | 0.52 | 3 | 24 | 0 | 108 | 0 |
| Start_Location | 2570 | 0.23 | 4 | 13 | 0 | 13 | 0 |
| End_Condition | 2490 | 0.26 | 4 | 27 | 0 | 73 | 0 |
| Notes | 3230 | 0.04 | 5 | 156 | 0 | 124 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age_days | 0 | 1 | 96.95 | 67.12 | 0 | 43 | 82 | 143 | 283 | ▇▇▃▃▁ |
| age_weeks | 0 | 1 | 13.42 | 9.58 | 0 | 6 | 11 | 20 | 40 | ▇▆▃▂▁ |
Columns with a complete rate <1 need to be inspected
further.
Inspect Column Types
end_table <- raw %>%
filter(is.na(End)) %>%
head(3)
sc <- raw %>%
filter(!is.na(Start_Condition)) %>%
head(3)
sl <- raw %>%
filter(!is.na(Start_Location)) %>%
head(3)
ec <- raw %>%
filter(!is.na(End_Condition)) %>%
head(3)
| Type | Start | End | Start_Condition | Start_Location | End_Condition | Notes | age_days | age_weeks |
|---|---|---|---|---|---|---|---|---|
| Pump | 2023-11-22 12:00:00 | NA | 10ml | NA | NA | NA | 265 | 37 |
| Growth | 2023-11-20 15:11:00 | NA | 9.02kg | 71.9cm | NA | NA | 263 | 37 |
| Pump | 2023-11-13 10:01:00 | NA | 40ml | NA | NA | NA | 256 | 36 |
| Type | Start | End | Start_Condition | Start_Location | End_Condition | Notes | age_days | age_weeks |
|---|---|---|---|---|---|---|---|---|
| Pump | 2023-11-22 12:00:00 | NA | 10ml | NA | NA | NA | 265 | 37 |
| Growth | 2023-11-20 15:11:00 | NA | 9.02kg | 71.9cm | NA | NA | 263 | 37 |
| Pump | 2023-11-13 10:01:00 | NA | 40ml | NA | NA | NA | 256 | 36 |
| Type | Start | End | Start_Condition | Start_Location | End_Condition | Notes | age_days | age_weeks |
|---|---|---|---|---|---|---|---|---|
| Growth | 2023-11-20 15:11:00 | NA | 9.02kg | 71.9cm | NA | NA | 263 | 37 |
| Growth | 2023-11-01 09:33:00 | NA | 9.02kg | 69cm | 45cm | NA | 244 | 34 |
| Feed | 2023-09-27 18:50:00 | NA | Breast Milk | Bottle | 60ml | NA | 209 | 29 |
| Type | Start | End | Start_Condition | Start_Location | End_Condition | Notes | age_days | age_weeks |
|---|---|---|---|---|---|---|---|---|
| Growth | 2023-11-01 09:33:00 | NA | 9.02kg | 69cm | 45cm | NA | 244 | 34 |
| Feed | 2023-09-27 18:50:00 | NA | Breast Milk | Bottle | 60ml | NA | 209 | 29 |
| Feed | 2023-09-27 12:47:00 | NA | Breast Milk | Bottle | 100ml | NA | 209 | 29 |
Start Condition, Start Location, and End condition seem to be used inconsistently depending on the Type so this will need to be assessed as the data is split into groups for analysis. The End only appears for events that have a distinct Start and End e.g. sleep so an incomplete column is acceptable.
Sleep
Longest and Shortest Sleeps
# Filter for rows recording sleep
sleep <- raw %>%
filter(Type == "Sleep" & !is.na(End)) %>%
dplyr::select(Start, End, age_weeks) %>%
mutate(diff_minutes = as.numeric(round(difftime(End, Start, units = "mins"))))
# Get longest periods of sleep
longest_sleep <- sleep %>%
arrange(desc(diff_minutes)) %>%
head(5) %>%
mutate(
Start = as.character(Start),
Length = round(diff_minutes/60, 1)
) %>%
dplyr::select(Start, Length) %>%
arrange(Start)
# Get shortest periods of sleep
shortest_sleep <- sleep %>%
arrange(diff_minutes) %>%
head(5) %>%
mutate(
Start = as.character(Start)) %>%
dplyr::select(Start, diff_minutes) %>%
arrange(Start)
kable(longest_sleep, caption = "Longest Periods of Sleep", col.names = c("Start", "Length (Hours)"))
| Start | Length (Hours) |
|---|---|
| 2023-06-11 18:40:00 | 12.7 |
| 2023-06-13 18:29:00 | 13.0 |
| 2023-08-10 19:07:00 | 12.4 |
| 2023-08-17 18:54:00 | 12.7 |
| 2023-08-26 19:19:00 | 12.5 |
kable(shortest_sleep, caption = "Shortest Periods of Sleep", col.names = c("Start", "Length (Minutes)"))
| Start | Length (Minutes) |
|---|---|
| 2023-04-19 15:05:00 | 1 |
| 2023-05-03 16:38:00 | 1 |
| 2023-05-05 18:54:00 | 1 |
| 2023-09-26 09:45:00 | 1 |
| 2023-10-11 07:05:00 | 1 |
There seem to have been some ‘false start’ sleeps of 1 minute. A more useful measure may be the frequency of the lowest sleep lengths.
shortest_sleep <- sleep %>%
group_by(diff_minutes) %>%
summarise(frequency = n()) %>%
arrange(diff_minutes) %>%
head(5)
kable(shortest_sleep, caption = "Shortest Periods of Sleep", col.names = c("Length (Minutes)", "Count"))
| Length (Minutes) | Count |
|---|---|
| 1 | 6 |
| 2 | 7 |
| 3 | 5 |
| 4 | 9 |
| 5 | 15 |
Hours of Sleep
Per Day
sleep_day <- raw %>%
filter(Type == "Sleep" & !is.na(End)) %>%
dplyr::select(Start, End, age_weeks) %>%
mutate(sleep_length = round(difftime(End, Start, units = "mins")),
date = as.Date(Start)) %>%
group_by(date) %>%
summarise(total_sleep = sum(sleep_length, na.rm = TRUE)) %>%
mutate(rolling_average = zoo::rollmean(total_sleep, k = 10, fill = NA)) %>%
dplyr::select(date, total_sleep, rolling_average)
p1 <- ggplot(sleep_day, aes(x = date)) +
geom_line(aes(y = total_sleep), size = 1, alpha = 0.9, color = "lightgrey") +
geom_line(aes(y = rolling_average, color = "10-Day Rolling Average"), size = 1, alpha = 1) +
labs(title = "Daily Sleep Over Time",
x = "Date",
y = "Minutes") +
scale_color_manual(values = brewer.pal(n = 2, name = "Set2")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
guides(
linetype = "none",
color = guide_legend(title = NULL),
fill = guide_legend(title = NULL)
) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month", expand = c(0, 0))
p1
Mum confirmed that they stopped recording overnight sleep in around September.
sleep_day_filtered <- sleep_day %>%
filter(date < as.Date("2023-09-01"))
p2 <- ggplot(sleep_day_filtered, aes(x = date)) +
geom_line(aes(y = total_sleep), size = 1, alpha = 0.9, color = "lightgrey") +
geom_line(aes(y = rolling_average, color = "10-Day Rolling Average"), size = 1, alpha = 1) +
labs(title = "Daily Sleep Over Time - Birth to 6 Months",
x = "Date",
y = "Minutes") +
scale_color_manual(values = brewer.pal(n = 2, name = "Set2")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
guides(
linetype = "none",
color = guide_legend(title = NULL),
fill = guide_legend(title = NULL)
) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month", expand = c(0, 0))
p2
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.
Feeds
Volume per Day
feeds <- raw %>%
filter(Type == "Feed") %>%
dplyr::select(Start, End_Condition) %>%
mutate(Volume = as.numeric(str_extract(End_Condition, "\\d+"))) %>%
rename(date = Start) %>%
dplyr::select(-End_Condition)
feeds_day <- feeds %>%
mutate(date = as.Date(date),
volume = as.numeric(Volume)) %>%
group_by(date) %>%
summarise(vol_day = sum(volume)) %>%
mutate(rolling_average = zoo::rollmean(vol_day, k = 10, fill = NA))
p5 <- ggplot(feeds_day, aes(x = date)) +
geom_line(aes(y = vol_day, color = "Volume"), size = 1, alpha = 0.7, linetype = "solid") +
geom_line(aes(y = rolling_average, color = "10-Day Rolling Average"), size = 1, alpha = 0.9) +
labs(title = "Daily Volume of Breast Milk Consumed",
x = "Date",
y = "Millilitres") +
scale_color_manual(values = c("10-Day Rolling Average" = "#66C2A5", "Volume" = "lightgrey")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Rotate labels by 45 degrees
guides(
linetype = "none",
color = guide_legend(title = NULL),
fill = guide_legend(title = NULL)
) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month", expand = c(0, 0))
p5
Mum confirmed they stopped recording feeds midway through
June.
feeds_day_filtered <- feeds_day %>%
filter(date < as.Date("2023-07-01"))
p6 <- ggplot(feeds_day_filtered, aes(x = date)) +
geom_line(aes(y = vol_day, color = "Volume"), size = 1, alpha = 0.7, linetype = "solid") +
geom_line(aes(y = rolling_average, color = "10-Day Rolling Average"), size = 1, alpha = 0.9) +
labs(title = "Daily Volume of Milk Consumed - Birth to 3 Months",
y = "Millilitres") +
scale_color_manual(values = c("10-Day Rolling Average" = "#66C2A5", "Volume" = "lightgrey")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Rotate labels by 45 degrees
guides(
linetype = "none",
color = guide_legend(title = NULL),
fill = guide_legend(title = NULL)
) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month", expand = c(0, 0))
p6
Pumping
Overall Pumping Statistics
pump <- raw %>%
filter(Type == "Pump") %>%
select(Start, Start_Condition) %>%
mutate(volume = as.numeric(str_extract(Start_Condition, "\\d+"))) %>%
select(Start, volume)
pump_statistics <- pump %>%
summarise(
total_pumps = n(),
total_volume = sum(volume, na.rm = TRUE)/1000,
mean_volume = round(mean(volume, na.rm = TRUE), 1),
median_volume = round(median(volume, na.rm = TRUE), 1),
min_volume = round(min(volume, na.rm = TRUE), 1),
max_volume = round(max(volume, na.rm = TRUE), 1),
sd_volume = round(sd(volume, na.rm = TRUE), 1)
)
kable(pump_statistics, caption = "Pumping Statistics", col.names = c("Total Pumps", "Total Volume (L)", "Mean Volume (mL)", "Median Volume (mL)", "Smallest Volume (mL)", "Largest Volume (mL)", "Standard Deviation (mL)"))
| Total Pumps | Total Volume (L) | Mean Volume (mL) | Median Volume (mL) | Smallest Volume (mL) | Largest Volume (mL) | Standard Deviation (mL) |
|---|---|---|---|---|---|---|
| 961 | 211.1 | 219.7 | 180 | 10 | 570 | 109.3 |
Total Daily Volume Pumped Statistics
pump_day <- raw %>%
filter(Type == "Pump") %>%
dplyr::select(Start, Start_Condition) %>%
mutate(date = as.Date(Start),
volume = as.numeric(str_extract(Start_Condition, "\\d+"))) %>%
dplyr::select(date, volume) %>%
group_by(date) %>%
summarise(total_volume = sum(volume)) %>%
mutate(rolling_average = zoo::rollmean(total_volume, k = 10, fill = NA))
pump_day_statistics <- pump_day %>%
summarise(
mean_volume = mean(total_volume, na.rm = TRUE),
median_volume = median(total_volume, na.rm = TRUE),
min_volume = min(total_volume, na.rm = TRUE),
max_volume = max(total_volume, na.rm = TRUE),
sd_volume = sd(total_volume, na.rm = TRUE)
)
kable(pump_day_statistics, caption = "Daily Pumping Statistics", col.names = c("Mean Volume (mL)", "Median Volume (mL)", "Smallest Volume (mL)", "Largest Volume (mL)", "Standard Deviation (mL)"))
| Mean Volume (mL) | Median Volume (mL) | Smallest Volume (mL) | Largest Volume (mL) | Standard Deviation (mL) |
|---|---|---|---|---|
| 872.314 | 930 | 10 | 1230 | 217.5086 |
Plot of Volume over Time
p7 <- ggplot(data = pump_day, aes(x = date)) +
geom_line(aes(y = total_volume, color = "Volume"), size = 1, alpha = 0.7) +
geom_line(aes(y = rolling_average, color = "10-Day Rolling Average"), size = 1, alpha = 0.9) +
labs(title = "Daily Volume of Breast Milk Pumped",
x = "Date",
y = "Millilitres") +
scale_color_manual(name = "Legend",
values = c("10-Day Rolling Average" = "#66C2A5", "Volume" = "lightgrey"),
labels = c("10-Day Rolling Average", "Volume")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(
linetype = "none",
color = guide_legend(title = NULL),
fill = guide_legend(title = NULL)
) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month", expand = c(0, 0))
p7
## Warning: Removed 9 rows containing missing values (`geom_line()`).
Distribution by Month
p8 <- pump_day %>%
mutate(month = month(date, label = TRUE)) %>%
ggplot(aes(x = month, y = total_volume, fill = month)) +
geom_boxplot() +
labs(title = "Pumped Milk Volumes by Month",
x = "Month",
y = "Volume") +
scale_fill_brewer(palette = "Pastel1") + # Choose a color palette
theme_minimal() +
theme(legend.position = "none")
p8
Mum confirmed that she stopped pumping by early November. This explains
the decrease in volume and change in distribution - while decreasing
supply, daily pumps throughout October decreased in total volume and
this trend continued into November until stopping. We can also see that
while establishing pumping in March (baby was born early March) volumes
were quite varied and the median was lower than in April.
Feeding & Pumping Comparison
pump <- pump %>%
mutate(date = as.Date(Start)) %>%
select(date, volume)
avg_pump <- pump %>%
group_by(date) %>%
summarise(count = n(),
vol = sum(volume)) %>%
filter(!is.na(count))
feeds <- feeds %>%
mutate(date = as.Date(date))
avg_feeds <- feeds %>%
group_by(date) %>%
summarise(count = n(),
Vol = sum(Volume)) %>%
filter(!is.na(count))
| Mean Pumps per Day | Mean Feeds per Day | Pump/Feed Ratio |
|---|---|---|
| 3.97 | 7.33 | 1.846348 |
The baby fed 80% more times than mum pumped.
mean_vol_pumped <- round(mean(pump$volume), 0)
mean_vol_fed <- round(mean(feeds$Volume), 0)
prop = round(mean_vol_pumped/mean_vol_fed, 2)
mean_vol_data <- data.frame("Mean Volume Pumped" = mean_vol_pumped, "Mean Volume Fed" = mean_vol_fed, "Pump/Feed Ratio" = prop)
kable(mean_vol_data, caption = "Feeding vs Pump Volumes", col.names = c("Mean Pumped Volume per Day", "Mean Volume Fed per Day", "Pump/Feed Ratio"))
| Mean Pumped Volume per Day | Mean Volume Fed per Day | Pump/Feed Ratio |
|---|---|---|
| 220 | 106 | 2.08 |
On average each pump produced more than twice as much milk as baby ate at each feed.
The density plot of number of pumps and number of feeds per day has quite wide spacing between the two. The feeds per day has an interesting multimodal distribution but still with an overall negative skew.
combined_plot <- bind_rows(
mutate(pump, source = "pump", volume = volume),
mutate(feeds, source = "feeds", volume = Volume)
)
ggplot(combined_plot, aes(x = volume, fill = source)) +
geom_density(alpha = 0.5) +
labs(title = "Density Plot of Feed and Pump Volumes",
x = "Volume",
y = "Density") +
scale_fill_manual(values = c("feeds" = "#66C2A5", "pump" = "lightgray"))
The density plot of volumes is quite different to the countse, with the
pump volumes having a bimodal distributon with the lower peak much
closer to the peak of the volume fed and significant overlap.
Pumping Linear Model
This model attempts to explain the variation in volume of milk pumped during each session.
A correlation matrix and VIF calculations are used to see if the predictors (baby’s age, minutes since last pump, hour of the day) are multicollinear.
Correlation Matrix
The correlation matrix identifies strong relationships between two predictor variables.
# data prep
pump_dist <- raw %>%
filter(Type == "Pump") %>%
dplyr::select(Start, Start_Condition) %>%
mutate(
volume = as.numeric(str_extract(Start_Condition, "\\d+")),
hour = hour(Start),
age_days = as.numeric((as.Date(Start)) - as.Date("2023-03-02"))
) %>%
dplyr::select(Start, volume, hour, age_days) %>%
arrange(Start) %>% # Arrange the data by timestamp
mutate(min_since_last_pump = as.numeric(difftime(Start, lag(Start), units = "mins"))) %>%
arrange(desc(min_since_last_pump)) %>%
slice(-c(1, 2)) %>%
# Normalize the variables
mutate(
volume_norm = scale(volume),
hour_norm = scale(hour),
age_days_norm = scale(age_days),
min_pump_norm = scale(min_since_last_pump)
)
# Normalisation
dist_norm <- pump_dist %>%
dplyr::select(volume_norm, hour_norm, age_days_norm, min_pump_norm) %>%
rename(vol = volume_norm,
hour = hour_norm,
age = age_days_norm,
mins = min_pump_norm) %>%
na.omit()
# Visualize correlation matrix (heatmap)
cor_matrix <- cor(dist_norm[, c("hour", "age", "mins")], use = "complete.obs")
cor_matrix
## hour age mins
## hour 1.0000000 -0.0631607 -0.5138272
## age -0.0631607 1.0000000 0.2990544
## mins -0.5138272 0.2990544 1.0000000
# Getting absolute values for visualisation
abs_cor_matrix <- abs(cor_matrix)
pheatmap(abs_cor_matrix, display_numbers = T, color = colorRampPalette(c('white','#66C2A5'))(100), fontsize_number = 15)
The correlation matrix suggests that there is a moderate relationship between hour of the day and minutes since last pump, and a small relationship between the age of the baby and minutes since last pump. Next I will calculate the VIF (Variance Inflation Factor) values to establish the degree of multicollinearity between them.
Variance Inflation Factor
model <- lm(vol ~ hour + age + mins, data = dist_norm)
# Calculate VIF
vif_values <- vif(model)
# Create a data frame with Variable and VIF columns
vif_df <- data.frame(Variable = names(vif_values), VIF = as.vector(vif_values))
# Display VIF values using knitr::kable
knitr::kable(vif_df, col.names = c("Variable", "VIF"))
| Variable | VIF |
|---|---|
| hour | 1.375541 |
| age | 1.111806 |
| mins | 1.504617 |
The low VIF values indicate that there is not a high level of correlation between these variables. This means there is no strong evidence to suggest the need for removing any of these variables due to multicollinearity despite the apparent correlation between the hour of the day and minutes since the last pump in the correlation matrix.
Linearity Test
Here I’m plotting the distribution of residuals for each of the explanatory variables to assess whether the linear model is appropriate.
Age seems to have a bimodal distribution of residuals so I’m going to exclude it from this model. The polynomial model of minutes since last pump has residuals that appear to be more noramlly distributed than the linear model so I will also use this in the final model.
Final Model
lm <- lm(vol ~ poly(mins, 2) + hour, data = dist_norm)
residuals <- residuals(lm)
ggplot(data.frame(residuals), aes(x = residuals)) +
geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency")
summary(lm)
##
## Call:
## lm(formula = vol ~ poly(mins, 2) + hour, data = dist_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.72712 -0.20733 0.01066 0.26070 2.10912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.987e-04 1.417e-02 0.049 0.961
## poly(mins, 2)1 1.707e+01 5.265e-01 32.420 <2e-16 ***
## poly(mins, 2)2 -1.502e+01 4.863e-01 -30.888 <2e-16 ***
## hour -2.320e-01 1.832e-02 -12.661 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4386 on 954 degrees of freedom
## Multiple R-squared: 0.8084, Adjusted R-squared: 0.8078
## F-statistic: 1342 on 3 and 954 DF, p-value: < 2.2e-16
The hour of the day and the minutes since the last pump explain about 81% of the variation in volume pumped in this linear model. The residuals are approximately normally distributed which supports this model being a good fit for the data. Age of the baby may also have some explanatory power, however the residuals were not normally distributed and thus has been excluded as beyond the scope of this project.
Growth
growth <- raw %>%
filter(Type == "Growth") %>%
select(Start, Start_Condition, Start_Location, End_Condition) %>%
rename(weight = Start_Condition,
length = Start_Location,
head = End_Condition) %>%
mutate(date = as.Date(Start),
weight = as.numeric(sub("kg", "", weight)),
head = as.numeric(sub("cm", "", head)),
length = as.numeric(sub("cm", "", length)),
age_days = as.numeric(difftime(date, "2023-03-02", units = "days")),
age_weeks = round(age_days / 7, 1),
age_days = round(age_days))
# Process the growth charts data
growth_charts <- read_excel("growth_chart.xlsx")%>%
rename(age_days = Age) %>%
select(age_days, P10, P25, P50, P75)
# Filter and select data for plotting
weight_growth <- growth %>%
filter(!is.na(weight)) %>%
select(weight, age_days)
# Define the color palette
colors <- brewer.pal(6, "Dark2")
# Plotting with adjusted x-axis limit and percentile labels
ggplot() +
geom_line(data = weight_growth,
aes(x = age_days, y = weight),
color = colors[3], linetype = "solid", size = 1.5, alpha = 0.8) + # Increased line width and decreased alpha
geom_line(data = growth_charts,
aes(x = age_days, y = P25),
color = colors[1], linetype = "dashed", size = 1, alpha = 0.6) + # Decreased line width and alpha
geom_line(data = growth_charts,
aes(x = age_days, y = P50),
color = colors[2], linetype = "dashed", size = 1, alpha = 0.6) + # Decreased line width and alpha
geom_line(data = growth_charts,
aes(x = age_days, y = P75),
color = colors[6], linetype = "dashed", size = 1, alpha = 0.6) + # Decreased line width and alpha
labs(title = "Weight by Age with Percentiles",
x = "Age (days)",
y = "Weight (kg)") +
theme_minimal() +
theme(legend.position = "bottom") +
scale_x_continuous(limits = c(0, 275), breaks = seq(0, 275, by = 20)) + # Adjusted x-axis limit
scale_y_continuous(limits = c(2.5, 10), breaks = seq(2.5, 10, by = 2)) +
scale_color_manual(values = c(colors[3], colors[1], colors[2], colors[6])) + # Match label text colors to line colors
geom_text(data = growth_charts %>% filter(age_days == 270),
aes(x = age_days, y = P25, label = "P25"),
vjust = -0.5, hjust = -0.5, color = colors[1]) +
geom_text(data = growth_charts %>% filter(age_days == 270),
aes(x = age_days, y = P50, label = "P50"),
vjust = -0.5, hjust = -0.5, color = colors[2]) +
geom_text(data = growth_charts %>% filter(age_days == 270),
aes(x = age_days, y = P75, label = "P75"),
vjust = -0.5, hjust = -0.5, color = colors[6])